diff --git a/.dockerignore b/.dockerignore index 6f94157..fee4dbe 100644 --- a/.dockerignore +++ b/.dockerignore @@ -6,3 +6,9 @@ # Omit Python cache files. __pycache__/ + +# Ignore pytest cache files. +.pytest_cache/ + +# Ignore ruff cache files. +.ruff_cache/ \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 10f2458..806c638 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,7 +38,7 @@ jobs: - name: Check formatting run: uv run ruff format --check -- src/ - - name: Linting + - name: Lint run: uv run ruff check -- src/ # Note: This spins up containers running the default services. @@ -47,8 +47,8 @@ jobs: - name: Spin up Docker Compose stack in background run: docker compose up --detach - # Note: The `--exit-code-from test` option applies the exit code of the `test` container - # to the `docker compose` process, so that the GHA step fails if tests fail. + # Note: The `--exit-code-from test` option applies the exit code of the `test` container + # to the `docker compose` process, so that the GHA step fails if tests fail. # Reference: https://docs.docker.com/reference/cli/docker/compose/up/ - name: Spin up `test` container run: docker compose up --exit-code-from test test diff --git a/.gitignore b/.gitignore index e82b666..1252056 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,12 @@ __pycache__ # Top-level environment configuration file. /.env + +# Ignore pytest cache files. +/.pytest_cache/ + +# Ignore ruff cache files. +/.ruff_cache/ + +# Ignore Vite files. +/.vite/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8dcafec..a9fed31 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -68,6 +68,7 @@ to make sure the Python virtual environment has the updated dependencies. ## Spin up container-based development environment +### Start the server This repository includes a container-based development environment. 
If you have Docker installed, you can spin up that development environment by running: ```sh @@ -76,4 +77,45 @@ docker compose up --detach Once that's up and running, you can access the API at: http://localhost:8000 -Also, you can access the MongoDB server at: `localhost:27017` (its admin credentials are in `docker-compose.yml`) \ No newline at end of file +Also, you can access the MongoDB server at: `localhost:27017` (its admin credentials are in `docker-compose.yml`) + +### Run Ingest +To populate the database with data, run: +```sh +docker compose run --volume /path/to/data:/data --rm ingest \ + uv run --active \ + python /app/mongodb/ingest_data.py \ + --mongo-uri "mongodb://${MONGO_USERNAME}:${MONGO_PASSWORD}@${MONGO_HOST}:${MONGO_PORT}" \ + --input /data --clean +``` +(See `docker-compose.yml` for details.) + +Or, if you want to use the data in `tests/data`, simply run: +```sh +docker compose up ingest +``` + +### Run Tests + +Run the tests: + +```sh +docker compose up test +``` + +
+Show/hide FAQ about the ingest script's role in testing + +Note: The test suite includes a fixture, named `seeded_db`, that will invoke the ingest script automatically before each test that specifies that fixture as a dependency. + +```py +def test_foo(seeded_db): + # The ingest script will be invoked automatically before this test runs. + pass + +def test_bar(): + # The ingest script will _not_ be invoked automatically before this test runs. + pass +``` + +
diff --git a/Dockerfile b/Dockerfile index 9025baf..7f351eb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,4 +43,19 @@ COPY . /app # Run the FastAPI development server on port 8000, accepting HTTP requests from any host. # Reference: https://fastapi.tiangolo.com/deployment/manually/ -CMD [ "uv", "run", "fastapi", "dev", "--host", "0.0.0.0", "/app/src/server.py" ] \ No newline at end of file +CMD [ "uv", "run", "fastapi", "dev", "--host", "0.0.0.0", "/app/src/server.py" ] + +# ────────────────────────────────────────────────────────────────────────────┐ +FROM development AS test +# ────────────────────────────────────────────────────────────────────────────┘ + +# Create a local virtual environment directory +# This is necessary for keeping the test environment isolated from +# running server environment in /app/.venv +RUN mkdir -p /app_venv +ENV VIRTUAL_ENV="/app_venv" + +# This target inherits from development and is used for running tests +# No additional setup needed as development already has dev dependencies +# --active flag ensures that the local virtual environment is used +CMD [ "uv", "run", "--active", "pytest", "-v" ] \ No newline at end of file diff --git a/demo/bertron_demo.ipynb b/demo/bertron_demo.ipynb deleted file mode 100644 index 6cdbca7..0000000 --- a/demo/bertron_demo.ipynb +++ /dev/null @@ -1,1245 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "e8827a9d", - "metadata": {}, - "source": [ - "# BERtron API Client Showcase\n", - "\n", - "This notebook demonstrates the full functionality of the BERtron Python client, including:\n", - "- Connecting to the BERtron API\n", - "- Retrieving entity data using various query methods\n", - "- Loading data into pandas DataFrames for analysis\n", - "- Performing geospatial queries and visualizations\n", - "- Working with pydantic Entity objects for type safety" - ] - }, - { - "cell_type": "markdown", - "id": "4549c76d", - "metadata": {}, - "source": [ - "## 1. 
Import Required Libraries\n", - "\n", - "First, let's import all the necessary libraries for our demonstration." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "164e201b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ All libraries imported successfully!\n", - "📊 Ready to showcase BERtron client functionality\n" - ] - } - ], - "source": [ - "# Import the BERtron client and related modules\n", - "import sys\n", - "sys.path.append('/Users/shreyas/Dev/git/bertron/src')\n", - "\n", - "from bertron_client import BertronClient, BertronAPIError, QueryResponse\n", - "from schema.datamodel.bertron_schema_pydantic import Entity, BERSourceType, EntityType\n", - "\n", - "# Import data analysis and visualization libraries\n", - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from typing import List, Dict, Any\n", - "\n", - "# Set up matplotlib for inline plotting\n", - "%matplotlib inline\n", - "plt.style.use('default')\n", - "sns.set_palette(\"husl\")\n", - "\n", - "# Configure pandas display options\n", - "pd.set_option('display.max_columns', None)\n", - "pd.set_option('display.max_rows', 20)\n", - "pd.set_option('display.width', None)\n", - "\n", - "print(\"✅ All libraries imported successfully!\")\n", - "print(\"📊 Ready to showcase BERtron client functionality\")" - ] - }, - { - "cell_type": "markdown", - "id": "de658b26", - "metadata": {}, - "source": [ - "## 2. Initialize BERtron Client\n", - "\n", - "Let's create a BERtron client instance and test the connection to the API server." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "f8494634", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🔗 Connection Status:\n", - " Web Server: ok\n", - " Database: True\n", - "✅ BERtron API is healthy and ready!\n" - ] - } - ], - "source": [ - "# Initialize the BERtron client\n", - "client = BertronClient(base_url=\"http://localhost:8000\")\n", - "\n", - "# Test the connection with a health check\n", - "try:\n", - " health_status = client.health_check()\n", - " print(\"🔗 Connection Status:\")\n", - " print(f\" Web Server: {health_status['web_server']}\")\n", - " print(f\" Database: {health_status['database']}\")\n", - " print(\"✅ BERtron API is healthy and ready!\")\n", - " \n", - "except BertronAPIError as e:\n", - " print(f\"❌ API Connection Error: {e}\")\n", - "except Exception as e:\n", - " print(f\"❌ Unexpected Error: {e}\")\n", - " print(\"Make sure the BERtron server is running on localhost:8000\")" - ] - }, - { - "cell_type": "markdown", - "id": "3818390f", - "metadata": {}, - "source": [ - "## 3. Retrieve All Entities\n", - "\n", - "Let's fetch all entities from the BERtron database and examine the data structure." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "6ef3a986", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📊 Total entities found: 5\n", - "📁 Response type: \n", - "🔍 First entity type: \n", - "🔍 First entity: DSNY_CoreB_TOP\n", - "🔍 Entity ID: nmdc:bsm-11-bsf8yq62\n", - "🔍 Data source: NMDC\n", - "🔍 Entity types: ['sample']\n", - "🔍 Coordinates: lat=28.125842, lng=-81.434174\n", - "\n", - "📋 Available entity attributes:\n", - " • alt_ids: NoneType\n", - " • alt_names: NoneType\n", - " • ber_data_source: str\n", - " • coordinates: Coordinates\n", - " • description: str\n", - " • entity_type: list\n", - " • id: str\n", - " • linkml_meta: LinkMLMeta\n", - " • model_computed_fields: dict\n", - " • model_config: dict\n", - " • model_extra: NoneType\n", - " • model_fields: dict\n", - " • model_fields_set: set\n", - " • name: str\n", - " • part_of_collection: NoneType\n", - " • uri: str\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/f1/z8zqsl31799cg7s80k011y1w000h39/T/ipykernel_11946/3416257218.py:19: PydanticDeprecatedSince211: Accessing the 'model_computed_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.\n", - " if not attr.startswith('_') and not callable(getattr(first_entity, attr)):\n", - "/var/folders/f1/z8zqsl31799cg7s80k011y1w000h39/T/ipykernel_11946/3416257218.py:20: PydanticDeprecatedSince211: Accessing the 'model_computed_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.\n", - " value = getattr(first_entity, attr)\n", - "/var/folders/f1/z8zqsl31799cg7s80k011y1w000h39/T/ipykernel_11946/3416257218.py:19: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. 
Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.\n", - " if not attr.startswith('_') and not callable(getattr(first_entity, attr)):\n", - "/var/folders/f1/z8zqsl31799cg7s80k011y1w000h39/T/ipykernel_11946/3416257218.py:20: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.\n", - " value = getattr(first_entity, attr)\n" - ] - } - ], - "source": [ - "# Get all entities from the database\n", - "all_entities_response = client.get_all_entities()\n", - "\n", - "print(f\"📊 Total entities found: {all_entities_response.count}\")\n", - "print(f\"📁 Response type: {type(all_entities_response)}\")\n", - "\n", - "if all_entities_response.entities:\n", - " first_entity = all_entities_response.entities[0]\n", - " print(f\"🔍 First entity type: {type(first_entity)}\")\n", - " print(f\"🔍 First entity: {first_entity.name}\")\n", - " print(f\"🔍 Entity ID: {first_entity.id}\")\n", - " print(f\"🔍 Data source: {first_entity.ber_data_source}\")\n", - " print(f\"🔍 Entity types: {first_entity.entity_type}\")\n", - " print(f\"🔍 Coordinates: lat={first_entity.coordinates.latitude}, lng={first_entity.coordinates.longitude}\")\n", - " \n", - " # Show all available attributes\n", - " print(f\"\\n📋 Available entity attributes:\")\n", - " for attr in dir(first_entity):\n", - " if not attr.startswith('_') and not callable(getattr(first_entity, attr)):\n", - " value = getattr(first_entity, attr)\n", - " print(f\" • {attr}: {type(value).__name__}\")\n", - "else:\n", - " print(\"⚠️ No entities found in the database\")" - ] - }, - { - "cell_type": "markdown", - "id": "8777f9eb", - "metadata": {}, - "source": [ - "## 4. Convert Entities to Pandas DataFrame\n", - "\n", - "Now let's convert the entity data into a pandas DataFrame for easier analysis and manipulation." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "d6b5be94", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📊 DataFrame shape: (5, 15)\n", - "📋 Columns: ['id', 'name', 'uri', 'ber_data_source', 'description', 'entity_types', 'latitude', 'longitude', 'elevation', 'elevation_unit', 'depth', 'depth_unit', 'alt_ids_count', 'alt_names_count', 'collections_count']\n", - "\n", - "🔍 First few rows:\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnameuriber_data_sourcedescriptionentity_typeslatitudelongitudeelevationelevation_unitdepthdepth_unitalt_ids_countalt_names_countcollections_count
0nmdc:bsm-11-bsf8yq62DSNY_CoreB_TOPhttps://api.microbiomedata.org/biosamples/nmdc...NMDCMONet sample represented in NMDCsample28.125842-81.43417424.000mNonem000
1MONET:072e85bf-4a43-4212-83dc-108bb262620cMONet Core 60920_7https://sc-data.emsl.pnnl.gov/monetMONETNonesample68.633578-149.632826722.613unknownNoneNone000
2EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488EMSL Sample c9405190-e962-4ba5-93f0-e3ff499f4488https://sc-data.emsl.pnnl.gov/?projectId=61815EMSLClostridium thermocellum protein extractssample34.000000118.000000NaNNoneNoneNone000
3doi:10.15485/2441497NGEE Arctic Council Site, Mile Marker 71, Alaskahttps://data.ess-dive.lbl.gov/view/doi:10.1548...ESS-DIVEMaps of land surface phenology derived from Pl...unspecified64.847286-163.719936NaNNoneNoneNone100
4Gb0051341Hot spring microbial communities from Yellowst...https://gold.jgi.doe.gov/biosample?id=Gb0051341JGISmall acidic pool on hillside north of Nymph L...jgi_biosample44.752321-110.7253932280.000meter (UO:0000008)NoneNone120
\n", - "
" - ], - "text/plain": [ - " id \\\n", - "0 nmdc:bsm-11-bsf8yq62 \n", - "1 MONET:072e85bf-4a43-4212-83dc-108bb262620c \n", - "2 EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488 \n", - "3 doi:10.15485/2441497 \n", - "4 Gb0051341 \n", - "\n", - " name \\\n", - "0 DSNY_CoreB_TOP \n", - "1 MONet Core 60920_7 \n", - "2 EMSL Sample c9405190-e962-4ba5-93f0-e3ff499f4488 \n", - "3 NGEE Arctic Council Site, Mile Marker 71, Alaska \n", - "4 Hot spring microbial communities from Yellowst... \n", - "\n", - " uri ber_data_source \\\n", - "0 https://api.microbiomedata.org/biosamples/nmdc... NMDC \n", - "1 https://sc-data.emsl.pnnl.gov/monet MONET \n", - "2 https://sc-data.emsl.pnnl.gov/?projectId=61815 EMSL \n", - "3 https://data.ess-dive.lbl.gov/view/doi:10.1548... ESS-DIVE \n", - "4 https://gold.jgi.doe.gov/biosample?id=Gb0051341 JGI \n", - "\n", - " description entity_types \\\n", - "0 MONet sample represented in NMDC sample \n", - "1 None sample \n", - "2 Clostridium thermocellum protein extracts sample \n", - "3 Maps of land surface phenology derived from Pl... unspecified \n", - "4 Small acidic pool on hillside north of Nymph L... 
jgi_biosample \n", - "\n", - " latitude longitude elevation elevation_unit depth depth_unit \\\n", - "0 28.125842 -81.434174 24.000 m None m \n", - "1 68.633578 -149.632826 722.613 unknown None None \n", - "2 34.000000 118.000000 NaN None None None \n", - "3 64.847286 -163.719936 NaN None None None \n", - "4 44.752321 -110.725393 2280.000 meter (UO:0000008) None None \n", - "\n", - " alt_ids_count alt_names_count collections_count \n", - "0 0 0 0 \n", - "1 0 0 0 \n", - "2 0 0 0 \n", - "3 1 0 0 \n", - "4 1 2 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "def entities_to_dataframe(entities: List[Entity]) -> pd.DataFrame:\n", - " \"\"\"\n", - " Convert a list of pydantic Entity objects to a pandas DataFrame.\n", - " \"\"\"\n", - " if not entities:\n", - " return pd.DataFrame()\n", - " \n", - " data = []\n", - " for entity in entities:\n", - " # Extract basic entity information\n", - " row = {\n", - " 'id': entity.id,\n", - " 'name': entity.name,\n", - " 'uri': entity.uri,\n", - " 'ber_data_source': entity.ber_data_source,\n", - " 'description': entity.description,\n", - " 'entity_types': ', '.join(entity.entity_type) if entity.entity_type else None,\n", - " }\n", - " \n", - " # Extract coordinate information\n", - " if entity.coordinates:\n", - " row.update({\n", - " 'latitude': entity.coordinates.latitude,\n", - " 'longitude': entity.coordinates.longitude,\n", - " 'elevation': entity.coordinates.elevation.has_numeric_value if entity.coordinates.elevation else None,\n", - " 'elevation_unit': entity.coordinates.elevation.has_unit if entity.coordinates.elevation else None,\n", - " 'depth': entity.coordinates.depth.has_numeric_value if entity.coordinates.depth else None,\n", - " 'depth_unit': entity.coordinates.depth.has_unit if entity.coordinates.depth else None,\n", - " })\n", - " \n", - " # Add alternative IDs and names count\n", - " row.update({\n", - " 'alt_ids_count': len(entity.alt_ids) if entity.alt_ids else 0,\n", - " 
'alt_names_count': len(entity.alt_names) if entity.alt_names else 0,\n", - " 'collections_count': len(entity.part_of_collection) if entity.part_of_collection else 0,\n", - " })\n", - " \n", - " data.append(row)\n", - " \n", - " return pd.DataFrame(data)\n", - "\n", - "# Convert all entities to DataFrame\n", - "entities_df = entities_to_dataframe(all_entities_response.entities)\n", - "\n", - "print(f\"📊 DataFrame shape: {entities_df.shape}\")\n", - "print(f\"📋 Columns: {list(entities_df.columns)}\")\n", - "print(\"\\n🔍 First few rows:\")\n", - "display(entities_df.head())" - ] - }, - { - "cell_type": "markdown", - "id": "f40186ac", - "metadata": {}, - "source": [ - "## 5. Data Analysis and Visualization\n", - "\n", - "Let's analyze the data we've retrieved and create some visualizations." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "8db65513", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📊 DATASET OVERVIEW\n", - "==================================================\n", - "Total entities: 5\n", - "Data sources: 5\n", - "Unique entity types: 3\n", - "\n", - "📍 GEOGRAPHIC DISTRIBUTION\n", - "==================================================\n", - "Latitude range: 28.1258 to 68.6336\n", - "Longitude range: -163.7199 to 118.0000\n", - "\n", - "🏷️ DATA SOURCES\n", - "==================================================\n", - " NMDC: 1 entities\n", - " MONET: 1 entities\n", - " EMSL: 1 entities\n", - " ESS-DIVE: 1 entities\n", - " JGI: 1 entities\n", - "\n", - "🔖 ENTITY TYPES\n", - "==================================================\n", - " sample: 3 entities\n", - " unspecified: 1 entities\n", - " jgi_biosample: 1 entities\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Basic statistics about the data\n", - "print(\"📊 DATASET OVERVIEW\")\n", - "print(\"=\" * 50)\n", - "print(f\"Total entities: {len(entities_df)}\")\n", - "print(f\"Data sources: {entities_df['ber_data_source'].nunique()}\")\n", - "print(f\"Unique entity types: {entities_df['entity_types'].nunique()}\")\n", - "\n", - "print(\"\\n📍 GEOGRAPHIC DISTRIBUTION\")\n", - "print(\"=\" * 50)\n", - "print(f\"Latitude range: {entities_df['latitude'].min():.4f} to {entities_df['latitude'].max():.4f}\")\n", - "print(f\"Longitude range: {entities_df['longitude'].min():.4f} to {entities_df['longitude'].max():.4f}\")\n", - "\n", - "print(\"\\n🏷️ DATA SOURCES\")\n", - "print(\"=\" * 50)\n", - "source_counts = entities_df['ber_data_source'].value_counts()\n", - "for source, count in source_counts.items():\n", - " print(f\" {source}: {count} entities\")\n", - "\n", - "print(\"\\n🔖 ENTITY TYPES\")\n", - "print(\"=\" * 50)\n", - "type_counts = entities_df['entity_types'].value_counts()\n", - "for entity_type, count in type_counts.items():\n", - " print(f\" {entity_type}: {count} entities\")\n", - "\n", - "# Create visualizations\n", - "fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n", - "\n", - "# 1. Data sources pie chart\n", - "axes[0, 0].pie(source_counts.values, labels=source_counts.index, autopct='%1.1f%%', startangle=90)\n", - "axes[0, 0].set_title('Distribution by Data Source')\n", - "\n", - "# 2. Entity types bar chart\n", - "type_counts.plot(kind='bar', ax=axes[0, 1], color='lightblue')\n", - "axes[0, 1].set_title('Entity Types Distribution')\n", - "axes[0, 1].set_xlabel('Entity Type')\n", - "axes[0, 1].set_ylabel('Count')\n", - "axes[0, 1].tick_params(axis='x', rotation=45)\n", - "\n", - "# 3. 
Geographic scatter plot\n", - "scatter = axes[1, 0].scatter(entities_df['longitude'], entities_df['latitude'], \n", - " c=pd.Categorical(entities_df['ber_data_source']).codes, \n", - " alpha=0.7, s=100)\n", - "axes[1, 0].set_title('Geographic Distribution of Entities')\n", - "axes[1, 0].set_xlabel('Longitude')\n", - "axes[1, 0].set_ylabel('Latitude')\n", - "axes[1, 0].grid(True, alpha=0.3)\n", - "\n", - "# 4. Data summary table\n", - "axes[1, 1].axis('tight')\n", - "axes[1, 1].axis('off')\n", - "summary_data = [\n", - " ['Total Entities', len(entities_df)],\n", - " ['Data Sources', entities_df['ber_data_source'].nunique()],\n", - " ['Entity Types', entities_df['entity_types'].nunique()],\n", - " ['Avg Latitude', f\"{entities_df['latitude'].mean():.4f}\"],\n", - " ['Avg Longitude', f\"{entities_df['longitude'].mean():.4f}\"],\n", - "]\n", - "table = axes[1, 1].table(cellText=summary_data, \n", - " colLabels=['Metric', 'Value'],\n", - " cellLoc='center', loc='center')\n", - "table.auto_set_font_size(False)\n", - "table.set_fontsize(10)\n", - "table.scale(1.2, 1.5)\n", - "axes[1, 1].set_title('Summary Statistics')\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "b1650cb1", - "metadata": {}, - "source": [ - "## 6. Geospatial Queries\n", - "\n", - "Let's demonstrate the geospatial query capabilities of the BERtron client." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "ba0ad16c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🌍 GEOSPATIAL QUERY EXAMPLES\n", - "==================================================\n", - "\n", - "🔍 Searching for entities within 100km of Orlando, FL\n", - " Center coordinates: 28.5383, -81.3792\n", - " Found: 1 entities\n", - " Query type: geospatial_nearby\n", - " Metadata: {'center': {'latitude': 28.5383, 'longitude': -81.3792}, 'radius_meters': 100000}\n", - "\n", - "📍 Nearby entities:\n", - " 1. 
DSNY_CoreB_TOP\n", - " Location: 28.1258, -81.4342\n", - " Source: NMDC\n", - "\n", - "📦 BOUNDING BOX QUERY\n", - "==============================\n", - "Searching within bounding box:\n", - " Southwest: 25.0, -85.0\n", - " Northeast: 31.0, -80.0\n", - " Found: 1 entities\n", - " Query type: geospatial_bounding_box\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAK9CAYAAADxDSf7AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjMsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvZiW1igAAAAlwSFlzAAAPYQAAD2EBqD+naQAAW6NJREFUeJzt3QeYXFX5B+Cz6ZCQAoQUSOiEXqUE6VIF/jQFESQ0EWlC6CpCEAELRaQJQkABUURQQEAMTZHeUQzF0NMoaZCe+T/fibPcTd0ku5ndnfd9nmF37p2dPXPPTLi/Ped8t6ZUKpUSAAAAWauZXwAAAAhCEgAAQIGQBAAAUCAkAQAAFAhJAAAABUISAABAgZAEAABQICQBAAAUCEkAAAAFQhJAC1VTU5POOeec2vs33HBD3vbWW29VtF3w8MMP5/difAVoioQkgDkoB4ribbnllkvbb799uvfeeyvdvGYnwlrxWLZq1Sr16tUr7bHHHumJJ56oWLv+9a9/pYMPPjgtv/zyqX379ql37975/r///e/UnB166KF1jne8tjXWWCP94Ac/SJMmTUpN0S233JIuvfTSSjcDIGsz8wsAc3LuueemlVdeOZVKpTRy5Mgcnr785S+nu+66K5/gNyff+MY30te+9rV8wlwpV111VerUqVOaMWNGevfdd9O1116bttlmm/TUU0+lDTfccLG25Y9//GM68MAD09JLL52OOOKI3M8xynbdddelP/zhD+l3v/td2muvvVJzFf38q1/9Kn8/duzY9Kc//Sn98Ic/TG+++Wa6+eabU1MMSa+88ko68cQTK90UACEJYF5222239IUvfKH2fpxM9+jRI/32t79tdiGpdevW+VZJX/nKV9Kyyy5be3/vvfdO6667brrtttsWa0iKoBChcZVVVkmPPvpo6t69e+2+73znO2nrrbfOI0ovvfRSDk+L02effZaWXHLJRX6eNm3a5NdQdswxx6Qtt9wyv3cvvvji/D4GYM5MtwNYAF27dk1LLLFEPgEt+vTTT9PJJ5+c+vTpk/+C369fv/Szn/0sj0CVxShFTH2K0aj5rR8qT09744038tSp+L1dunRJhx12WD6JLpo8eXI66aST8on+Ukstlf7v//4vvffee7P9jjmtSVpppZVy2PvHP/6RNttss9ShQ4ccHH7961/P9vMRGLbddtv8+ldYYYV03nnnpcGDBy/SOqeePXvmr7Mez1GjRtUG0mjTBhtskG688cY6++P1brfddnWOcRyvjh07pgMOOGCev/enP/1pPo7XXHNNnYAUIsT98pe/TBMmTMiPK4t+iOM1q3Jfzeqmm25Km2yyST5eMVoVo3gxelYU7Y+Q+Oyzz+YRtQhH3/3ud9OAAQNyO6ZOnTrb8+688875/bWgoo1bbbVVPl7//e9/6+yLKaQRDOPYxXto9913z1MRi0aMGJHff9H38R6P6ZIx0lbs+1nfx2Vx3OL4zU0ch3vuuSe9/fbbtVMEi8f6F7/4RVpnnXXy8enWrVv+w0WMPAE0FiNJAPMQ05Q+/PDDfGIZJ+ZxshYnz8W/0Me+CCYPPfRQPrGPEZH7778/nXrqqen9999Pl1xyyUL//v333z+PZFxwwQXpueeey9OnYm3Uj3/849rH
HHnkkfmE/Otf/3oeKXjwwQfzSW59RbCIEZ5oe5ycX3/99fmENk7w48Q0xOuI9Vhx8nrmmWfmk+loy4JO3fv444/z15huF88Z078iBMXrLJs4cWI+aY52HXfccfn1x0hTtGnMmDF5pCeOQUzd++pXv5r75IQTTsjPGY+Jk/wrr7xynu2I6ZJxEh7BYE4isMT+eNz8nmtOfvSjH6Wzzjorv67on9GjR+d2xvM+//zzOfSWffTRR3nEMkJUvK8iGMbxjaAa76PiiGUElejfs88+Oy2McqCJoFH2m9/8Jvf7Lrvskt9XER7j2EagiraWw8p+++2Xg9Pxxx+ft8Xn4YEHHkjvvPPOHMPjgvje976XP2sR7sufl5iWGWJKZvRvvEej72NNVQT2J598Mr/nARpFCYDZDB48OIYnZru1b9++dMMNN9R57J133pn3nXfeeXW2f+UrXynV1NSU3njjjXx/2LBh+XHx3LOK7WeffXbt/fg+th1++OF1HrfPPvuUlllmmdr7L7zwQn7cMcccU+dxX//612d7zvJrinaUrbjiinnbo48+Wrtt1KhR+XWefPLJtduOP/74/Fqef/752m0fffRRaemll57tOeek/HpmvXXt2rV033331XnspZdemvfddNNNtdumTJlS6t+/f6lTp06lcePG1W4/8MADS0suuWTptddeK/30pz/NPxf9MS9jxozJj9trr73m+bj/+7//y48r/74BAwbk4zW311b21ltvlVq3bl360Y9+VOdxL7/8cqlNmzZ1tm+77bb5Z6+++uo6j50+fXpphRVWKB1wwAF1tl988cW5H/773//Os+3R1o4dO5ZGjx6db/Ee/NnPfpZ/dt111y3NmDEjP278+PG5D775zW/W+fkRI0aUunTpUrv9k08+ye2MYzwvs77nyuK4RZvKHnroofzY+Fq2++67z/H4Rj+ts8468/y9AA3NdDuAebjiiivyX8vjFqM1MZoSIwOx6L/sL3/5S17rE3/tLorpd3HeuCjV8I4++ug692PkI0Yexo0bV/u7w6y/e0EWv6+99tp1RlRi+llM5ypOybrvvvtS//7966wbiilkBx100AK9nttvvz0fy7/+9a95ql5UXIsRin/+85+1j4nXFNPwoqhCWdu2bfNrjFG8Rx55pHb75ZdfnqchxihDjNzEOqP5FVsYP358/hojTvNS3l9+fH3FeyNGtWIUKUYhy7d4TauvvnoecSyK0biYxlYU1f/i2P75z3+u8/uj4EKMFtZnnVRMAY2+jNtqq62WTjnllPTFL34xF3AoTw+MvojRuTjWxbbG+3nzzTevbWtMGWzXrl0u2f3JJ5+kxSlG3WKE6emnn16svxeobqbbAcxDrNMpFm6Ik8mNNtooTwOLaVBx4hjrKKJ09Kwn3WuttVb+GvsXVt++fevcL0+TihPVzp075+eOE+pVV121zuMWZM3KrL+j/HuKJ8PxeyIkzSpOvhdETDcrFm6IcBPBIaZwxbqc8u+KbfG65nc8I6hddtlledpdTFOL7+envuEn9keYKLa3Pl5//fUcjuM1zEkEvqIoPx7vo1kdcsghefrbHXfckb8fOnRoPkZXX311vdoR0xhjumCIkPGTn/wkT5GLwFNsa9hhhx3m+BzxHisHuWhLBP84zltssUV+/0e7yuvKGsvpp5+e/va3v+XPYrzfYk1WTLOLwAfQWIQkgAUQJ+4xmvTzn/88n2CW1+zUx5wW94fp06fP9WfmVo2uWKxgUS2O3zE3se4kRixidCNGPmItzoKKdTshQl2EgeJ6nzmJkacItbGuZV5ifxQpKAeY+vZfjCLFY2MEcU7HtrzWpqwYWmYd4Yt1YTGCGWEkvkZbiuu35iV+94477lh7P9Ycrbnmmulb3/pWHqEqt7W8LmlOYadYUCNGJ/fcc89055135mMeI3exVi7WSMUfDuZlXu/x+YlwHAHx7rvvziOaMRoZ68Timk+DBg1a6OcFmBfT7QAW0LRp0/LX
mPoVVlxxxfTBBx/MNjLxn//8p3Z/cRQopjcVLcpIUzx3nOhGSeuiOKlsSPF7opDCrOa0rSGOZwTQ8gn83I5niJPmKCBx2mmn5WllUYCg/HzzEif7w4YNy1X95uTvf/97LnIQI1Rl0X+z9t2c+i9G9SJgxpS4CCmz3mIUpr4iHEUIGT58eK7mFgU5ikUXFkRUo4sqiDG6VL6Ab3kEMgphzKmtUUBj1tcWo0kxXTKuaTRlypR00UUXzfMYxWOi/fMztxAayhULY4pmFIqI4xDFMZrqhXGB5k9IAlgAUZI5ThDjL/rl6V9xcdn4S3msjymKKl1x4heVy8pTl2LqVlyXp2hhqqeVlZ971mlml156aWpIMQrx+OOPpxdeeKFOpbpFvShpPEesR4pRjDhRLx/PqOIWF3Mti+AT1eFiFCbKkIc4GY/1YTEN6/zzz89hKSoAxvfzE+tzopx0jKrEGq9Z2xRrwaK/YlplMSBEBbbiCFSc/Md0uKJ99903j+LEKMeso3Fxf9bfNy8xvTPeQ1HVLdaIFasqLoyY1hiv+8ILL6zt13idcczmVG48qvKFqHg3ayCJ4xFTF6MEfXHbrO/vKLNen5GkCEJxfGc16/GKz16MssWxnFObARqC6XYA8xBTpsojGLGeI/6aH6McZ5xxRu16jRiViCl4UcY4Rh/imj4RpGIKWUxRKq4XipP6OEGNr7HWKU4oX3vttYVuXxRSiBPpCFpxghmL+ocMGdIgIzxFMVIT07122mmnfKJdLgEe65kiVMxrFKDoD3/4Qw46cYIbo2/XXXddniYX62zKz3HUUUfl6xRFOe9YgxPlpePnHnvssRz+ymuKIjjECXSsV4lQsuuuu+bjGtdviuIN0Q9zE2tbosR2HLv11lsvlz+PkZ/ov3Kbbr311joFEqJEd6yP2WeffXIRiXKp7Cg+EeGsLPo72hCl0uP54oK50eYYuYpAFa8vQlp9xOhYvK4ogR7TCBektPucLLPMMrlIRLxfXn311Rz04zVEwYuNN944v8b4nTFaE9ctinU/Ef7jPfqlL30pT/WLgBLT8OK1jBw5Mv9MWRz/CJhRjCPeKy+++GKemlefdV0xtTCC8cCBA9Omm26a3yfx2Yo1SBGioy2xHiraHW2KYzG/4hsAC63B6+UBtNAS4B06dChtuOGGpauuuqq2hHJZlFI+6aSTSr179y61bdu2tPrqq+dyybM+7rPPPisdccQRubzyUkstVdp///1zye25lQCP8s1zalex5PbEiRNLJ5xwQi4NHmWf99xzz9K7775b7xLgUXp5VlGaOm5FUf576623zuXBozz1BRdcULrsssvyc0bJ6AUtAR5tjbLev//972d7/MiRI0uHHXZYadllly21a9eutN5669Upnf6nP/0pP8dFF11U5+eiXHe8pg022CCXDZ+fKMsd5dJ79uxZatWqVW0//+tf/5rj4//617/mEtrRpn79+uUy5bOWAC+7/fbbS1tttVV+nXFbc801S8cee2xp6NChtY+JYzy/8tZxfOL5jzrqqFJ9lUuAz8mbb76ZS5TPWpJ7l112ye/LeP2rrrpq6dBDDy0988wzef+HH36Y2x6vIZ43Hrf55pvP1ndRuvz000/P/Ral2eM5o/x4fUqAT5gwIfdFlCSPfeVy4L/85S9L22yzTX5/x3sv2nbqqaeWxo4dW+/jAbCgauI/Cx+xAKhmMVIWoz6xnmhuBSCakxhdihGsmNYW3zcFMSIZo1Ex6ji3i98C0LBMtwOgXiZOnFinEltMdYuqaFtttVWLCEjlQgmxziimU0Zlu/qsb2ps1157bVpllVXycQZg8TCSBEC91z9FtbNYxxJrUWLtTqwrijVQcf0jGlasiYoiEVFmO0rOz3rBYAAaj5AEQL1897vfzQUU4lpEUWQhFvqfffbZda7FQ8OJYxzFC6L0dRS2KF6zCIDGJSQBAAAUuE4SAABAgZAEAABQ0OInOM+YMSMvLI4LztX3YocAAEDL
EyuNxo8fn3r37p1atWpVvSEpAlKfPn0q3QwAAKCJePfdd/OlHqo2JMUIUvlAdO7cudLNaTKja6NHj07du3efZ4Km5dDn1Um/Vx99Xn30efXR54tm3LhxeQClnBGqNiSVp9hFQBKSPv9wTZo0KR8PH67qoM+rk36vPvq8+ujz6qPPG8b8luE4sgAAAAVCEgAAQIGQBAAA0FTWJK200krp7bffnm37Mccck6644oo83/Lkk09Ot956a5o8eXLaZZdd0pVXXpl69OhRkfYCADSHEsfTpk1L06dPr3RTaASxJmnq1Kn5PNmapNm1bt06tWnTZpEv/VPRkPT000/X+QC/8soraaeddkpf/epX8/2TTjop3XPPPem2225LXbp0Sccdd1zad99902OPPVbBVgMANE1TpkxJw4cPT5999lmlm0IjhuAISnGtH9cAnbMll1wy9erVK7Vr1y41y5AUpQuLLrzwwrTqqqumbbfdNo0dOzZdd9116ZZbbkk77LBD3j948OC01lprpSeeeCJtscUWFWo1AEDTEyfOw4YNy39Jjwtlxgmik+iWO1LYEKMlLfHYTJkyJZdIj8/C6quvvtCjbU2mBHi8oJtuuikNHDgwd/izzz6bhxJ33HHH2sesueaaqW/fvunxxx+fa0iKaXlxK9ZCL//DETdmHovyXyGoDvq8Oun36qPPq7vP41wqZujEBTLjL+m0XHGO3LZt20o3o0nq0KFDDpCxpCcyQfv27evsr++/j00mJN15551pzJgx6dBDD833R4wYkf8C0rVr1zqPi/VIsW9uLrjggjRo0KDZtkeijLmbzHxzxEhd/KNqLmt10OfVSb9XH31e3X0eAan8R+EYaaBlKvd1MJI0Z+XPwYcffjhbmIxpis0qJMXUut122y0PDy+KM888M49GzXpV3Zja52KyM8WbJj5UrtRcPfR5ddLv1UefV3efx0hSnADGX9HjRstmJGnu4v0f/wYus8wyeWSpaNb7c32O1ATEcNjf/va39Mc//rF2W8+ePfOHPUaXiqNJI0eOzPvmJobUZh1WC3Gg/A/jc/EPqmNSXfR5ddLv1UefV3efx/flGy13JKncv/p5zsqfgTn9W1jffxubxL+gUZBhueWWS7vvvnvttk022SQn5CFDhtRuGzp0aHrnnXdS//79K9RSAAAWt4cffjif9MYfz8MNN9ww25KMxeWtt97KbXnhhRfm+bjtttsunXjiiYutXTSsVk1hmDhC0oABA+oMDUfJ7yOOOCJPnXvooYdyIYfDDjssBySV7QAAWpYozBWV+Yp/NF8UxVG14i2uv1lfsVZ+7733rrMtlnFEmfV11113jgGuLGZI/fCHP2yQ18LiV/HpdjHNLkaHDj/88Nn2XXLJJXlIbL/99qtzMVkAAFqWWJ9+/PHH568ffPDBIq9TD/GH+F133bXOtkUdgYogN6+lH2VLL730Iv0eqnwkaeedd85zK9dYY43Z9sXCqiuuuCJ9/PHH6dNPP82JvD5vSgAAFm6GT5xzjRo1KgeV+Br3G7us/IQJE9Lvfve79O1vfzuPJMV0uoYQgSjOHYu38sL98pS9+++/P1+Hs1OnTjlQxShROOecc9KNN96Y/vSnP9WOQsWoUXG6XXy//fbb58d369Ytby9Xap51ul38wf+UU05Jyy+/fOrYsWPafPPN8/MV1+jvueee+Xli/zrrrJP+8pe/NMhxoBmGJAAAKi+C0CeffJI++uijfNmUuB9f435sb8yg9Pvf/z5fD7Nfv37p4IMPTtdff33+I3pj++yzz9LPfvaz9Jvf/CY9+uijeXZTBJkQX/fff//a4BS3Lbfccrapd7fffnvt2vl4zM9//vM5/q7jjjsuTymM6X4vvfRS+upXv5qf+/XXX8/7jz322Bykoh0vv/xy+vGPf5yDG1U63Q4AgMqbOHFiHjWKNeIxpawsrskT22MEJkY4GkNMsYtwFCI4xLWfHnnkkTwasygOPPDAOq8l/Pvf/059
+/atvSjr1VdfnVZdddXaIHPuuefm7yOgLLHEEjm4zG0mUzx3eVpdFCGb21S+CF8x9S++lqcRRgi777778vbzzz8/74slJuutt17ev8oqqyzSa2fRCEkAAOQgFGYNFXG/HJQaIyTFCMxTTz2V7rjjjnw/QtoBBxyQg9OihqRY377jjjvW2VZc67TkkkvWBqTQq1evPMWwocXIUBzDWZeXRACLa/mEE044IU83/Otf/5rbHIFp/fXXb/C2UD9CEgAAadq0aXO9hkystYn9jSHCUDx3MbzEVLu47uXll1+eKx4vrBgBWm211ep9QdZ4nY0xzS/WXEXYjGrNs4bQ8pS6I488Mhcpu+eee3JQuuCCC9JFF12Ui1mw+FmTBABAHsGZ27qjCA7FS7U0lAhHv/71r3MYiEII5duLL76YQ9Nvf/vbVEnt2rXLI0Dze0yY1+M22mijvD9GqSK0FW/FqXyxxunoo4/OxcpOPvnkdO211zbgq2FBGEkCACBPpYtCDXEyP+uapAhJjTHV7u67785FIeLamLOOGMV0sxhlitCwsOLaRSNGjKizbamllqr3a1lppZVy9buYEhjT4uY0qrXiiivmEah4LV/+8pfzOqZZCy7ENLuDDjooHXLIITkQRmgaPXp0GjJkSJ5SFxX9ohLebrvtlh8bxySuExpV96gMI0kAAOST+wgPMbozZcqUXNQgvsb92B77G1qEoFh/M6fwESHpmWeeyZXgFtZhhx2W1xkVb7/4xS/q/fPf/OY3c8W9L3zhC6l79+7psccem+0xUdJ70KBB6Ywzzkg9evTIxR/mJAo0REiKEaJ4zrhI7dNPP11bRCLCaFS4i2AUxSsiLLk+aOXUlBZHfcUKGjduXP7gRZWUzp07p2pXvv7B+++/n/+SEnNxy//wzW0eMi2j32OIPyrv6Ofqod+rjz6v7j6PQDNs2LC08sor114LaGGer1zlLsJRTLFzntC0xKl7uW9iBIvZxYjo3D4L9c0GpttV4fUPYvFg/HWofP2DuMU/gHHxMv8AAkD1ivOAOCdorFLf0Fw4I67S6x/ECFLconJM3I/tsR8AAKqdkFRF5nX9gxiuLe8HAIBqJiRVkUpd/wAAAJoTIamKVOL6BwAA0NwISVWkvAhz1oudNeb1DwAAoLkxdFCF1z8oV7eLWygHpMa4/gEAADQ3QlIVifVIUea7Xbt2uZJd3HedJAAAqEtIqtLrH3Tt2tXFBgGA2ZVKKY0dm9Jnn6W05JIpdekSFZ4q3SpYrJwhAwCQ0vjxKf32tyntuWdKm26a0lZbzfwa92N77G8iHn744VyZd8yYMfN83EorrZQuvfTSue5/66238vO88MILqSU755xz0oYbblh7/9BDD0177713RdvU1AlJAADV7vHHU9phh5QGDkzpuedmjhy1bz/za9yP7bE/HteArr766rTUUkvVuQxJrJ2O5QDbbbfdHIPRm2++mbbccss0fPjw1CVGuVJKN9xwQ54ls6D69OmTn2fdddddpNcR7Srfolpw375908CBA9PkyZNTU/Tzn/88H7PGduihh9Y5Nssss0zadddd00svvZSaOiEJAKCaRfA5/PCU3nsvpR49UurdO6XOnaMs7syvcT+2x/54XAMGpe233z6HomeeeaZ229///vfUs2fP9OSTT6ZJkybVbn/ooYdy+Fh11VXz+up4TJx4L4rWrVvn52mIy6AMHjw4B65hw4alK6+8Mv3mN79J5513XmqKIlwuTKhcGBGK4rjEbciQIflY77HHHqmpE5IAAKpVTKE74YSUYtpahKG5hYXYHvvjcfH4Bpp6169fv9SrV688SlQW3++1115p5ZVXTk888USd7RGqZp1uF98fdthhaezYsbUjFjG9rOyzzz5Lhx9+eB6xipB1zTXXzHW6Xfl542T+C1/4QlpyySXzqNXQoUPn+1oidETgitGpCAHxGp6LUbiCq666qjbkxWuPIDW3toR4fbGtfHzia6wnf/DBB9Om
m2461/ZdeOGFqUePHvk1H3HEEXXC5pym28Wo3QknnJBOO+20tPTSS+fXUTyG4T//+U/aaqutUocOHdLaa6+d/va3v+W23XnnnfM8Lu3bt8/PF7eY8nfGGWekd999N40ePbr2MS+//HLaYYcdciGxGG066qijcnguv+Y4XhGey37yk5/ktfUjR45MjUVIAgCoVnff/fkI0vxGZWJ/PO7991O6554Ga0IEnxglKovv46R92223rd0eVXljZKkckooiJMS6o86dO9eOWJxyyim1+y+66KIceJ5//vl0zDHHpG9/+9vzDT3f+9738s/FCFeMfETIWhCvvfZaDjKbb7557bY77rgjfec730knn3xyeuWVV9K3vvWtHO6Kr72+fvCDH6Sf/exnc2zf73//+xxwzj///Lw/QmiMbM3PjTfemIt7xXH+yU9+ks4999z0wAMP1F5TM0JVhLLYH0EzjtGCiuBz0003pdVWWy2HofDpp5+mXXbZJVdgfvrpp9Ntt92WA9hxxx2X98d74cQTT0zf+MY3chCOfjzrrLPSr371qxwEG02phRs7dmwpXmZ8Zabp06eXhg8fnr9SHfR5ddLv1UefV3efT5w4sfTvf/87f62XGTNKpd13L5V69SqVNtmk/rd4fPxc/HwDuPbaa0sdO3YsTZ06tTRu3LhSmzZtSqNGjSrdcsstpW222SY/ZsiQIfl87u233873H3rooXz/k08+yfcHDx5c6tKly2zPveKKK5YOPvjgwkueUVpuueVKV111Vb4/bNiw/DzPP/98nef929/+Vvsz99xzT942r+Ma+zt06JBfR/v27fP9PfbYozRlypTax2y55Zalb37zm3V+7qtf/Wrpy1/+8hzbEuL1xbZoV7F99913X34tc2pf//79S8ccc0yd37P55puXNthgg9r7AwYMKO21116197fddtvSVlttVednNt1009Lpp5+ev7/33ntzv8R7reyBBx7Iv/eOO+6Y63GJ39O6det8XOIWj+/Vq1fp2WefrX3MNddcU+rWrVtpwoQJdY55q1atSiNGjMj3J0+eXNpwww1L+++/f2nttdee7TjOal6fhfpmAyNJAADVKMp8x4hKp04L9nPx+Pi5ceMapBkxUhCjCTGKEFOq1lhjjdS9e/c8klRelxRTrlZZZZU8XW5Brb/++rXfx/SwmPY1atSoev9MjMSE+f3MJZdckqfKvfjii+nuu+/Oo0kx+lH26quvpi9+8Yt1fibux/YFtd566821ffF8xRGs0L9///k+Z/E1l5+3/Jwx8hbTCOPYlW222WapPmL0L45L3J566qk8arTbbrult99+u7a9G2ywQR7FKh6XGTNm1I74xXS7m2++Od1+++35/RDHurG5ThIAQDWK6yBNnz6zit2CaN06pSlTYp7UzGsoLaKYerXCCivkaWeffPJJDkehd+/e+cT8n//8Z94Xa1YWRlTKK4qgFCfg9f2ZcnGI+f1MBIh4LSHWG40fPz4deOCBuXhDefu8lK9dOXNgaqapU6c2WPsa4zjVR4Sf4uuPaXJROOLaa69doMIW8T4IH3/8cb4VQ1VjMJIEAFCN4kKxEXgiKC2IeHyc0DfgSWqMNsRoUdyKpb+32WabdO+99+YRiDmtRyqLkYZYN9OUROW88nqqsNZaa6XHHnuszmPifhRBCDF6FmJNVdnCXL8pfk+MwBUVC2AsjH79+uViC8VCCTHytzAifEUgLB6XGH2L0cTicYnHxO8NUfb9pJNOysEqRskGDBjQIAFuXoQkAIBqFKNAcRL6vypi9RaPj5+L8uANJALQP/7xjxwKyiNJIb7/5S9/maZMmTLPkBQXjY2iAFGV7sMPP8wV7Ra3qEQ3YsSI9MEHH6RHHnkkFz6IqYMRAsKpp56ar00UFe5ef/31dPHFF6c//vGPtUUmorLbFltskSvTxRS0eI7vf//7C9yOKA5x/fXX55LkMeXv7LPPTv/6178W6bXttNNOuSpfhJO4xlGE
mHLb5leGPa4VFcclbvG6jj/++NxXe8ZFilNKBx10UK6YF88dBS1i1DAeE1MVozBDhN+DDz44T9OLQhfxuqINUVijMQlJAADVKE5uDzoo5nelVLiY6zyVH3fwwfOvhrcAIgDFyEJMyypWLIuQFNPWyqXC5yYq3B199NHpgAMOyCMyUZ1tcYsT+GhjTB2MaXbrrLNOHgUrX4MpqsPFRVyjKl3si/AXJ/zFkbMIN3Fh3U022SRXdFuY6yzFMYjqb1HOO54n1v5ERb9FHRW78847c7iJ0uNHHnlkbXW7CDjzct999+XjErcYBSpXsCu/7qiYd//99+cpdPHcX/nKV9KXvvSldPnll+f9P/rRj/JriOMV4nmiul6EtBiBaiw1Ub0htWDjxo3L8x6jZGCUhmTmnNVYiBf15cvzX2nZ9Hl10u/VR59Xd5/HaEtcyDSuLzS/E9dacb2jWOsTZcDjOkjzCj5xyvjBBymtsEJKDz6Y0lJLNdjroP7i1D2CVISvRb2Y7qJ47LHH8nWT3njjjTzK1JREcYe5fRbqmw0UbgAAqFYRdC67LKW4zk4EoBjFmdMFZWMEKdajdO2a0i9+ISBVoTvuuCN16tQprb766jkYxbS+qELX1AJSQxGSAACqWZSHvv76lE44YeaFYstlvstFHcprlmIEKQLSFltUtLlUxvjx49Ppp5+e3nnnnbTsssumHXfcsdHXBVWSkAQAUO0iKMUUunvuSemmm2ZeBynKfMe0zY03nrkGaY89FvyaSrQYhxxySL5VCyEJAICZU+i+9rVY+T/zQrFRkjnKfMe6jQqufYFKEJIAAPhcBKIoD94AF4qF5krpGwAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACYi7feeivV1NSkF154Id9/+OGH8/0xY8ZUumk0IiEJAICKOfTQQ3PoKN+WWWaZtOuuu6aXXnopNUVbbrllGj58eOrSyNX/ymGsfFtiiSXSOuusk6655ppG/b3MJCQBAFBREYoieMRtyJAhqU2bNmmPuHhtE9SuXbvUs2fPHFwWh6FDh+bj8u9//zt961vfSsccc0x6MC78S6MSkgAAqKj27dvn4BG3DTfcMJ1xxhnp3XffTaNHj659zMsvv5x22GGHPKISo01HHXVUmjBhQu3+7bbbLp144ol1nnfvvffOI1VlK620Ujr//PPT4YcfnpZaaqnUt2/f2UZmnnrqqbTRRhulDh06pC984Qvp+eefr7N/1ul2N9xwQ+ratWu6//7701prrZU6depUG/rKpk2blk444YT8uGj76aefngYMGJDbNz/LLbdcPi4rr7xyfo74WmzT5MmT8/Z4XLR5q622Sk8//XTeN2nSpDz6FMeq7M0338yv/frrr5/v765mQhIAQEt38cUprbDC/G//93+z/2xsq8/Pxu9oABF8brrpprTaaqvlQBE+/fTTtMsuu6Ru3brlAHDbbbelv/3tb+m4445b4Oe/6KKLasNPjMp8+9vfzqM15d8dI1hrr712evbZZ9M555yTTjnllPk+52effZZ+9rOfpd/85jfp0UcfTe+8806dn/vxj3+cbr755jR48OD02GOPpXHjxqU777xzgdpdKpXSfffdl597s802q91+2mmnpdtvvz3deOON6bnnnsvHLY7Vxx9/nENT/N7Y96c//SlNnz49HXzwwWmnnXbKQZG5azOPfQAAtATjxqX0/vvzf1yfPrNvi9Gc+vxs/I6FdPfdd+cRmHIg6tWrV97WqtXMv+ffcssteVTk17/+derYsWPedvnll6c999wzB5AePXrU+3d9+ctfzuEoxIjOJZdckh566KHUr1+//HtmzJiRrrvuuhwwYhTmvffey0FqXqZOnZquvvrqtOqqq+b7Ed7OPffc2v2/+MUv0plnnpn22Wef2rb/5S9/qVd7V4gA+r8Ro2jboEGD0tZbb117rK666qo8mrXbbrvlbddee2164IEH8ms49dRT88jceeedl4488sj0ta99
Lb399tv52DJvQhIAQEvXuXNKyy8//8d17z7nbfX52fgdC2n77bfPJ/vhk08+SVdeeWU+6Y+pbyuuuGJ69dVX0wYbbFAbkMIXv/jFHBpiFGhBQtL6669f+31Mm4upbKNGjcr34/fE/ghIZf3795/vcy655JK1ASlEyCs/59ixY9PIkSPrjP60bt06bbLJJrn98/P3v/89T4+LkBTHIwJYTNs79thj89S5CGhxLMratm2bf1e8lrKTTz45j1xFOLv33ntrR+iYOyEJAKClGzhw5m1h/PnPqbFF+IlpYmW/+tWvcvW4GBWJUZD6iFGnmJJWFAFiVhEiiiIo1SeszMucnnPWtiysWIMUoSjEyNYTTzyRLrzwwhyS6isC22uvvZbD2euvv57XTDFv1iQBANCkRMiI0DNx4sR8PwoivPjii3l6WVms7YnHxDS50L179zrFEmL9zSuvvLJAvzd+T5Qej6l9ZRFKFkWEvRjpKhdTKLct1g8tjAg65eMSo1dRbS+ORTEYxu+KdVVlsf5ovfXWy2uTYophcZSJOROSAACoqJhKNmLEiHyLE/jjjz8+F1GINUfhoIMOylPgoiJcBJ9YQxSP+cY3vlE71S4q391zzz359p///CevI1rQC75+/etfzwHtm9/8Zi65HeuGoiDDooq2XnDBBbl4QkwP/M53vpOnFdanjHiMAsVxibVEUbAiilqUj0uMwMXrjLVHUdQh2hxtj0ISRxxxRH7MFVdckR5//PEckOI4RkW9+DplypRFfl0tmel2AABUVJzgxzqeEOtv1lxzzRwIoqx3ec1PlNiOcLHpppvm+/vtt1+6uFBRL0ZLYrTpkEMOyddZOumkk/JapwURxSPuuuuudPTRR+cy4DEaE4Uh4nctihi9iaATbYuRoCjJHRXo4vv5KY+UxWvq06dP/tnvf//7tftj6l1MF4zAOH78+Fy5L45VVAKMsBgBKoo4xM+GWO8V667OOuus/NqYs5pSQ02YbKKixGIMc8aiuc6LsKCwJYkPUvxVIurpl6vG0LLp8+qk36uPPq/uPo+RgWHDhuU1LMXCAzTNfoupffvvv3/64Q9/uEA/G6fucd2lCE2L64K2zU1Ml5zbZ6G+2cBIEgAANKKYKvfXv/41bbvttnlqYVSZi5P4mN5H0+TPTAAA0IhiZDeuZRRTBaNc98svv5wvhhujSTRNRpIAAKARxXqgYgU6mj4jSQAAAAVCEgBAC9LCa3LBYvkMmG63mCqYxEW/4gJo5WokUdd+iSWWUH0IAGgQbdu2zV/jGjlxjgHV6rPPPqvzmVgYQtJiCEhxsbDyFaIjFEVZwrhFUIoa9oISALCo4po7Xbt2zSXBQ1xLSInolkcJ8HkfmwhI8RmIz0J9rkM1N0JSIyuPIMUbudhR06dPz9ujdnuEJQCARdWzZ8/8tRyUaJlBIP4IH39kF5LmLAJS+bOwsISkRlYeQZo1ycb9clASkgCAhhAnzb169coXl506dWqlm0MjiID00UcfpWWWWcZspDmIKXaLMoJUJiQ1shgOndsbOP4hi/0AAA0pThIb4kSRphmSIgjEbCQhqfE4so0sptnFm3luw6WxHwAAaDqEpEZWnkoXU+uK4n6EJFPtAACgaTGM0ciiBGcEoVh7FMEopthFOCoHJCU6AQCgaRGSGlnMFY0y3zFv1HWSAACg6ROSFoMIQhGKTK0DAICmzzAGAABAgZAEAABQICQBAAAUCEkAAAAFQhIAAECBkAQAAFAgJAEAABQISQAAAAVCEgAAQIGQBAAAUCAkAQAAFAhJAAAABUISAABAgZAEAABQICQBAAAUCEkAAAAFQhIAAECBkAQAAFAgJAEAABQISQAAAAVCEgAAQIGQBAAAUCAkAQAAFAhJAAAABUISAABAgZAEAABQICQBAAAUCEkAAAAFQhIAAECBkAQAAFAgJAEAABQISQAAAAVCEgAAQIGQBAAAUCAkAQAAFAhJAAAABUISAABAgZAEAABQICQB
AAAUCEkAAAAFQhIAAECBkAQAAFAgJAEAABQISQAAAAVCEgAAQIGQBAAAUCAkAQAAFAhJAAAABUISAABAgZAEAABQICQBAAAUCEkAAAAFQhIAAEBTCknvv/9+Ovjgg9MyyyyTllhiibTeeuulZ555pnZ/qVRKP/jBD1KvXr3y/h133DG9/vrrFW0zAADQclU0JH3yySfpi1/8Ymrbtm26995707///e900UUXpW7dutU+5ic/+Um67LLL0tVXX52efPLJ1LFjx7TLLrukSZMmVbLpAABAC9Wmkr/8xz/+cerTp08aPHhw7baVV165zijSpZdemr7//e+nvfbaK2/79a9/nXr06JHuvPPO9LWvfa0i7QYAAFquioakP//5z3lU6Ktf/Wp65JFH0vLLL5+OOeaY9M1vfjPvHzZsWBoxYkSeYlfWpUuXtPnmm6fHH398jiFp8uTJ+VY2bty4/HXGjBn5xsxjEQHU8age+rw66ffqo8+rjz6vPvp80dT3uFU0JP33v/9NV111VRo4cGD67ne/m55++ul0wgknpHbt2qUBAwbkgBRi5Kgo7pf3zeqCCy5IgwYNmm376NGjTdErvDnGjh2bP2CtWlV8WRqLgT6vTvq9+ujz6qPPq48+XzTjx49v+iEpOvkLX/hCOv/88/P9jTbaKL3yyit5/VGEpIVx5pln5tBVHEmKKX3du3dPnTt3brC2N2dx3GtqavIx8eGqDvq8Oun36qPPq48+rz76fNF06NCh6YekqFi39tpr19m21lprpdtvvz1/37Nnz/x15MiR+bFlcX/DDTec43O2b98+32YVbyJvpM/Fh8sxqS76vDrp9+qjz6uPPq8++nzh1feYVfTIRmW7oUOH1tn22muvpRVXXLG2iEMEpSFDhtQZGYoqd/3791/s7QUAAFq+io4knXTSSWnLLbfM0+3233//9NRTT6Vrrrkm38op+cQTT0znnXdeWn311XNoOuuss1Lv3r3T3nvvXcmmAwAALVRFQ9Kmm26a7rjjjryO6Nxzz80hKEp+H3TQQbWPOe2009Knn36ajjrqqDRmzJi01VZbpfvuu6/e8wkBAACaTUgKe+yxR77NTYwmRYCKGwAAQGOz2gsAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQk
AACAAiEJAACgqYSkc845J9XU1NS5rbnmmrX7J02alI499ti0zDLLpE6dOqX99tsvjRw5spJNBgAAWriKjySts846afjw4bW3f/zjH7X7TjrppHTXXXel2267LT3yyCPpgw8+SPvuu29F2wsAALRsbSregDZtUs+ePWfbPnbs2HTdddelW265Je2www552+DBg9Naa62VnnjiibTFFltUoLUAAEBLV/GQ9Prrr6fevXunDh06pP79+6cLLrgg9e3bNz377LNp6tSpaccdd6x9bEzFi32PP/74XEPS5MmT861s3Lhx+euMGTPyjZnHolQqOR5VRJ9XJ/1effR59dHn1UefL5r6HreKhqTNN9883XDDDalfv355qt2gQYPS1ltvnV555ZU0YsSI1K5du9S1a9c6P9OjR4+8b24iZMXzzGr06NF5jRMz3xwxUhcfsFatKj7jksVAn1cn/V599Hn10efVR58vmvHjxzf9kLTbbrvVfr/++uvn0LTiiium3//+92mJJZZYqOc888wz08CBA+uMJPXp0yd17949de7cuUHa3RI+XFEkI46JD1d10OfVSb9XH31effR59dHniyZmrzWL6XZFMWq0xhprpDfeeCPttNNOacqUKWnMmDF1RpOiut2c1jCVtW/fPt9mFW8ib6TPxYfLMaku+rw66ffqo8+rjz6vPvp84dX3mDWpIzthwoT05ptvpl69eqVNNtkktW3bNg0ZMqR2/9ChQ9M777yT1y4BAAA0hoqOJJ1yyilpzz33zFPsorz32WefnVq3bp0OPPDA1KVLl3TEEUfkqXNLL710nip3/PHH54Cksh0AANAiQ9J7772XA9FHH32U51VutdVWubx3fB8uueSSPCQWF5GNinW77LJLuvLKKyvZZAAAoIWraEi69dZb57uw6oorrsg3AACAxaFJrUkCAACoNCEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgIYISW+++Wb6/ve/nw488MA0atSovO3ee+9N//rXvxb2KQEAAJpnSHrkkUfSeuutl5588sn0xz/+MU2YMCFvf/HFF9PZZ5/d0G0EAABo2iHpjDPOSOedd1564IEHUrt27Wq377DDDumJJ55oyPYBAAA0/ZD08ssvp3322We27cstt1z68MMPG6JdAAAAzSckde3aNQ0fPny27c8//3xafvnlG6JdAAAAzSckfe1rX0unn356GjFiRKqpqUkzZsxIjz32WDrllFPSIYcc0vCtBAAAaMoh6fzzz09rrrlm6tOnTy7asPbaa6dtttkmbbnllrniHQAAQHPVZmF+KIo1XHvttemss85Kr7zySg5KG220UVp99dUbvoUAAABNPSSV9e3bN98AAACqLiQNHDiw3k968cUXL2x7AAAAmkdIisp1Rc8991yaNm1a6tevX77/2muvpdatW6dNNtmk4VsJAADQ1ELSQw89VGekaKmllko33nhj6tatW972ySefpMMOOyxtvfXWjdNSAACAplrd7qKLLkoXXHBBbUAK8f15552X9wEAAFRVSBo3blwaPXr0bNtj2/jx4xuiXQAAAM0nJO2zzz55at0f//jH9N577+Xb7bffno444oi07777NnwrAQAAmnIJ8Kuvvjqdcsop6etf/3qaOnXqzCdq0yaHpJ/+9KcN3UYAAICmHZKWXHLJdOWVV+ZA9Oabb+Ztq666aurYsWNDtw8AAKD5XEw2QtH666/fcK0BAABojiFp++23TzU1NXPd/+CDDy5KmwAAAJpXSNpwww3r3I91SS+88EJ65ZVX0oABAxqqbQAAAM0jJF1yySVz3H7OOeekCRMmLGqbAAAAmlcJ8Lk5+OCD0/XXX9+QTwkAANB8Q9Ljjz+eOnTo0JBPCQAA0PSn2816wdhSqZSGDx+ennnmmXTWWWc1VNsAAACaR0jq3Llznep2rVq1Sv369Uvnnntu2nnnnRuyfQAAAE0/JN1www0N
3xIAAIDmuiZplVVWSR999NFs28eMGZP3AQAAVFVIeuutt9L06dNn2z558uT0/vvvN0S7AAAAmv50uz//+c+1399///2pS5cutfcjNA0ZMiSttNJKDdtCAACAphqS9t577/w1ijYMGDCgzr62bdvmgHTRRRc1bAsBAACaakiaMWNG/rryyiunp59+Oi277LKN1S4AAIDmU91u2LBhDd8SAACA5hSSLrvssnTUUUelDh065O/n5YQTTmiItgEAADTdkHTJJZekgw46KIek+H5uYr2SkAQAALT4kFScYme6HQAA0FIt1HWSzj333PTZZ5/Ntn3ixIl5HwAAQFWFpEGDBqUJEybMtj2CU+wDAACoqpBUKpXy2qNZvfjii2nppZduiHYBAAA0/RLg3bp1y+EobmussUadoDR9+vQ8unT00Uc3RjsBAACaXki69NJL8yjS4YcfnqfVdenSpXZfu3bt0korrZT69+/fGO0EAABoeiFpwIAB+evKK6+cttxyy9S2bdvGahcAAEDTD0ll2267be33kyZNSlOmTKmzv3PnzoveMgAAgOZSuCGq2B133HFpueWWSx07dsxrlYo3AACAqgpJp556anrwwQfTVVddldq3b59+9atf5TVKvXv3Tr/+9a8bvpUAAABNebrdXXfdlcPQdtttlw477LC09dZbp9VWWy2tuOKK6eabb04HHXRQw7cUAACgqY4kffzxx2mVVVapXX8U98NWW22VHn300YZtIQAAQFMPSRGQhg0blr9fc8010+9///vaEaZiWXAAAICqmG4XU+xefPHFXOXujDPOSHvuuWe6/PLL09SpU9PFF1/c8K2EFmbGjBlp4sSJ6dNPP03Tpk1Lbdq0yUVQllhiidSq1UL97QIAgEqGpJNOOqn2+x133DH95z//Sc8++2xadtll00033dRQbYMWG5A++eSTHJBChKIopR+3crVIQQkAoHIa5EwsCjbsu+++earddddd1xBPCS1WeQQpRo+iOmRclDm+xv3YHvsBAKgcf66Gxaw8gtS6des62+N+TU1N7X4AACpDSILFLNYgzW06XYSk2A8AQOUISbCYxbS6WJc0J6VSKe8HAKByFuhsLNYdzcuYMWMWtT3Q4kVxhijSMH369DpT7uJ+hKTYDwBAMwlJ87sGUuw/5JBDFrVN0KJFme8IQrH2KIJRTLGLcFQOSLEfAIBmEpIGDx7ceC2BKhHrkaLMd4cOHVwnCQCgCbL4ASogglCEIlPrAACaHn+yBgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAphiSLrzwwlRTU5NOPPHE2m2TJk1Kxx57bFpmmWVSp06d0n777ZdGjhxZ0XYCAAAtW5MISU8//XT65S9/mdZff/0620866aR01113pdtuuy098sgj6YMPPkj77rtvxdoJAAC0fBUPSRMmTEgHHXRQuvbaa1O3bt1qt48dOzZdd9116eKLL0477LBD2mSTTdLgwYPTP//5z/TEE09UtM0AAEDL1abSDYjpdLvvvnvacccd03nnnVe7/dlnn01Tp07N28vWXHPN1Ldv3/T444+nLbbYYo7PN3ny5HwrGzduXP46Y8aMfGPmsSiVSo5HFdHn1Um/Vx99Xn30efXR54umvsetoiHp1ltv
Tc8991yebjerESNGpHbt2qWuXbvW2d6jR4+8b24uuOCCNGjQoNm2jx49Oq9xYuabI0bq4gPWqlXFBxNZDPR5ddLv1UefVx99Xn30+aIZP3580w5J7777bvrOd76THnjggdShQ4cGe94zzzwzDRw4sM5IUp8+fVL37t1T586dG+z3NPcPVxTJiGPiw1Ud9Hl10u/VR59XH31effT5oqlv7qhYSIrpdKNGjUobb7xx7bbp06enRx99NF1++eXp/vvvT1OmTEljxoypM5oU1e169uw51+dt3759vs0q3kTeSJ+LD5djUl30eXXS79VHn1cffV599PnCq+8xq1hI+tKXvpRefvnlOtsOO+ywvO7o9NNPz6M/bdu2TUOGDMmlv8PQoUPTO++8k/r371+hVgMAAC1dxULSUkstldZdd9062zp27JiviVTefsQRR+Spc0svvXSeKnf88cfngDS3og0AAADNvrrdvFxyySV5SCxGkqJi3S677JKuvPLKSjcLAABowZpUSHr44YdnW1h1xRVX5BsAAMDiYLUXAABAgZAEAABQICQBAAAUCEkAAAAFQhIAAECBkAQAAFAgJAEAABQISQAAAAVCEgAAQIGQBAAAUCAkAQAAFAhJAAAABUISAABAgZAEAABQICQBAAAUCEkAAAAFQhIAAECBkAQAAFAgJAEAABQISQAAAAVCEgAAQIGQBAAAUCAkAQAAFAhJAAAABUISAABAgZAEAABQICQBAAAUCEkAAAAFQhIAAECBkAQAAFAgJAEAABQISQAAAAVCEgAAQIGQBAAAUCAkAQAAFAhJAAAABUISAABAgZAEAABQICQBAAAUCEkAAAAFQhIAAECBkAQAAFAgJAEAABQISQAAAAVCEgAAQIGQBAAAUCAkAQAAFAhJAAAABUISAABAgZAEAABQICQBAAAUCEkAAAAFQhIAAECBkAQAAFAgJAEAABQISQAAAAVCEgAAQIGQBAAAUCAkAQAAFAhJAAAABUISAABAgZAEAABQICQBAAAUCEkAAAAFQhIAAECBkAQAAFDQpngHAACgIcyYMSNNnDgxffrpp2natGmpTZs2qWPHjmmJJZZIrVo17bEaIQkAAGjwgPTJJ5/kgBQiFE2aNCnfIih169atSQclIQkAAGhQE/83ghSjR61bt67dPn369Ly9Q4cOOSw1VU03vgEAAM3Sp/8bQSoGpPL9mpqa2v1NlZAEAAA0qGnTps11Ol2EpNjflAlJAABAg2rTpk1elzQnpVIp72/KhCQAAKBBdfzfeqNYg1QU9yMkNeX1SKFpRzgAAKDZWWKJJXIQirVHEYxiil2Eo3JAiv1NmZAEAAA0qFatWuUy31HFznWSAAAA0sygFKGoqU+tm5OmHeEAAAAWMyEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAraFO8AVNzw4Sltuum8H7PUUin98IcpfeUri6tVAEAVEZKAyrnmmpQmTEipU6eUjjpq5rbp01N6//35/+xZZwlJAECjEJKAyjn33JmBaPnlPw9JrVvPvD835QA1fvziaSMAUHWEJKBp6dUrpffem/v+oUNTmjYtpTb++QIAGoezDKB56dev0i0AAFo41e0AAAAKhCQAAIAC0+2A5uWWW1L67LOUllwypa9/vdKtAQBaICEJaF5OO+3zinhCEgDQCEy3AwAAaCoh6aqrrkrrr79+6ty5c771798/3XvvvbX7J02alI499ti0zDLLpE6dOqX99tsvjRw5spJNBgAAWriKhqQVVlghXXjhhenZZ59NzzzzTNphhx3S
Xnvtlf71r3/l/SeddFK666670m233ZYeeeSR9MEHH6R99923kk0GGtIaa6S09tozvwIANBEVXZO055571rn/ox/9KI8uPfHEEzlAXXfddemWW27J4SkMHjw4rbXWWnn/FltsUaFWAw3mwQcr3QIAgKZbuGH69Ol5xOjTTz/N0+5idGnq1Klpxx13rH3Mmmuumfr27Zsef/zxuYakyZMn51vZuHHj8tcZM2bkGzOPRalUcjyqSEvq85r/3UpxawGvpzG1pH6nfvR59dHn1UefL5r6HreKh6SXX345h6JYfxTrju6444609tprpxdeeCG1a9cude3atc7je/TokUaMGDHX57vgggvSoEGDZts+evTo/DuY+eYYO3Zs/oC1aqV2RzVoSX3efcaM1Pp/r2n0qFGVbk6T1pL6nfrR59VHn1cffb5oxo8f3zxCUr9+/XIgis7+wx/+kAYMGJDXHy2sM888Mw0cOLDOSFKfPn1S9+7dc3EIZn64ampq8jHx4aoOLanPa/7X/ngdyy23XKWb06S1pH6nfvR59dHn1UefL5oOHTo0j5AUo0WrrbZa/n6TTTZJTz/9dPr5z3+eDjjggDRlypQ0ZsyYOqNJUd2uZ8+ec32+9u3b59us4k3kjfS5+HA5JtWlSfb5QQel9OGHKS27bEo337zg0+6a0mtpoppkv9Oo9Hn10efVR58vvPoes1ZNMR3HmqIITG3btk1Dhgyp3Td06ND0zjvv5Ol5QAsQo8Z//evMr/UVfySJC8nO448lAACLoqIjSTE1brfddsvFGGJ+YFSye/jhh9P999+funTpko444og8dW7ppZfOU+WOP/74HJBUtoMq9swzlW4BANDCVTQkjRo1Kh1yyCFp+PDhORTFhWUjIO200055/yWXXJKHxOIisjG6tMsuu6Qrr7yykk0GAABauIqGpLgO0vwWVl1xxRX5BgAAsDg0uTVJAAAAlVTx6nYAC+Rb30rp449TWnrplH75y0q3BgBogYQkoHm5556U3n9/ZoU7AIBGYLodAABAgZAEAABQYLodUDnf/GZKY8em1KVLpVsCAFBLSAIq5+yzK90CAIDZmG4HAABQICQBAAAUmG4HNF2l0sw1S599ltKSS1q7BAAsFkISUDkrrPD5NY/ee+/z7ePHp3T33SndfHNKQ4emNH16Sq1bp9SvX0oTJ1ayxQBAFRCSgKbl8cdTOuGEmaGppialTp1Sat9+ZlB67rmUJk1KadllU9p220q3FABooYQkoGkFpMMPT2nMmJR69EipzSz/RHXunNJyy6U0cuTMwBSP79+/Uq0FAFoohRuApiGm2MUIUgSk3r1TqU2bNH3GjDR12rQ0ZerU/DXux/bYnx8Xj4+fAwBoQEIS0DTEGqSYYtejRyrV1KTp06aladOmpRkzZuQCDvE17sf22J9HmmI90z33VLrlAEALIyQBTUMUaYjw06ZNDkQxalRTU5Na1dTU+Rrbc3AqT8W76aaZVfAAABqINUlA5UXIiSp2UaQhpZkhKKVUM8vD4n7b115LaerUlNq2Talv35k/N26c8uAAQIMxkgRUXoSicpnvnJlKswWkWtOnp5p4fNzi8fH1008XZ2sBgBZOSAIqr1WrmYEnglKMGNXUpHpNoIvHx8927NjYLQQAqoiQBFRerEWKC8VOmJDvtorgEyNKszxstuAUj4+fi9LgAAANxJokoHKi6MLkyTMvFjt8eErPPpvStGmpVZs2qXWrVrlIw2whKQJV0cEHz74NAGARCElA5Wy33effx/WOLr44lwGv6d07tW7TJq89iiIOeY1STU0OTnWm2i2/fEq7716RpgMALZfpdkDTsNRSKV12WUpdu6b0wQepZtq0HIratmmT2rVtm7/G/doxowhMv/jFzJ8DAGhAQhLQdPTvn9L116e0wgopjRqVw1Iu7x3V6+Jr3P9fcYfUrVtKW2xR6RYDAC2QkARUzsMPp3T//TO/FoPSgw/OnHq38cYzr6E0ZcrM
r3E/RppCu3YVazYA0LJZkwRUThRdeP/9mWuL3nvv8+0xhe5rX0vpgAM+H0mKMt9Rxa5Pn0q2GACoAkIS0HRF1bouXWbeyq6+OqWJE1NaYolKtgwAaMGEJKB52WOPSrcAAGjhrEkCAAAoEJIAAAAKTLcDmpdnn51Z7S6q222ySaVbAwC0QEIS0LzstdecK+IBADQQ0+0AAAAKhCQAAIACIQkAAKDAmiSgcqwpAgCaICNJAAAABUISAABAgZAEAABQYE0SUDmDBqU0dmxKXbqkdPbZlW4NAEAmJAGVc+21n18YVkgCAJoIIQloXl59NaVSKaWamkq3BABooYQkoHlZaqlKtwAAaOEUbgAAACgQkgAAAApMtwOal4svTmncuJQ6d05p4MBKtwYAaIGEJKD5haRyRTwhCQBoBKbbAQAAFAhJAAAABabbAZWz7bYpffhhSssuW+mWAADUEpKAyrn55kq3AABgNqbbAQAAFAhJAAAABUISAABAgZAEVM4OO6S0zjozvwIANBEKNwCV89prMy8MO3Zs/X9m441T6tMnpe7dG7NlAEAVE5KAyougtMIK8w9Hf/7zzBsAQCMSkoDKGTgwpZNP/jwozUuMHgEALAZCElDZkBQuvnj+jzW9DgBYTIQkoPJBqRyWAACaANXtAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACAAiEJAACgQEgCAAAoEJIAAAAKhCQAAIACIQkAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKGiTWrhSqZS/jhs3rtJNaTJmzJiRxo8fnzp06JBatZKTq4E+r076vfro8+qjz6uPPl805UxQzghVG5LiTRT69OlT6aYAAABNJCN06dJlrvtrSvOLUS0gbX/wwQdpqaWWSjU1NZVuTpNJ0BEa33333dS5c+dKN4fFQJ9XJ/1effR59dHn1UefL5qIPhGQevfuPc+RuBY/khQvfoUVVqh0M5qk+GD5cFUXfV6d9Hv10efVR59XH32+8OY1glRmIiMAAECBkAQAAFAgJFWh9u3bp7PPPjt/pTro8+qk36uPPq8++rz66PPFo8UXbgAAAFgQRpIAAAAKhCQAAIACIQkAAKBASAIAACgQklq4H/3oR2nLLbdMSy65ZOratescH1NTUzPb7dZbb63zmIcffjhtvPHGuZLKaqutlm644YbF9ApojD5/55130u67754fs9xyy6VTTz01TZs2rc5j9HnztdJKK832mb7wwgvrPOall15KW2+9derQoUO+cvtPfvKTirWXhnHFFVfkvo8+3XzzzdNTTz1V6SbRQM4555zZPtNrrrlm7f5JkyalY489Ni2zzDKpU6dOab/99ksjR46saJtZMI8++mjac889U+/evXP/3nnnnXX2R521H/zgB6lXr15piSWWSDvuuGN6/fXX6zzm448/TgcddFC+wGz8//+II45IEyZMWMyvpOUQklq4KVOmpK9+9avp29/+9jwfN3jw4DR8+PDa29577127b9iwYfmEevvtt08vvPBCOvHEE9ORRx6Z7r///sXwCmjoPp8+fXruz3jcP//5z3TjjTfmABT/+Jbp8+bv3HPPrfOZPv7442v3jRs3Lu28885pxRVXTM8++2z66U9/mk/Crrnmmoq2mYX3u9/9Lg0cODCXBX7uuefSBhtskHbZZZc0atSoSjeNBrLOOuvU+Uz/4x//qN130kknpbvuuivddttt6ZFHHkkffPBB2nfffSvaXhbMp59+mj+38ceOOYk/ZF122WXp6quvTk8++WTq2LFj/oxHQC6LgPSvf/0rPfDAA+nuu+/Oweuoo45ajK+ihYkS4LR8gwcPLnXp0mWO++JtcMcdd8z1Z0877bTSOuusU2fbAQccUNpll10avJ00fp//5S9/KbVq1ao0YsSI2m1XXXVVqXPn
zqXJkyfn+/q8eVtxxRVLl1xyyVz3X3nllaVu3brV9nc4/fTTS/369VtMLaShbbbZZqVjjz229v706dNLvXv3Ll1wwQUVbRcN4+yzzy5tsMEGc9w3ZsyYUtu2bUu33XZb7bZXX301/7/98ccfX4ytpKHMel42Y8aMUs+ePUs//elP6/R7+/btS7/97W/z/X//+9/5555++unax9x7772lmpqa0vvvv7+YX0HLYCSJLIbpl1122bTZZpul66+/Pg/rlj3++ON5WLco/noR22l+ot/WW2+91KNHjzr9GaML8Reo8mP0efMW0+ti6s1GG22UR4qK0ymjH7fZZpvUrl27Ov07dOjQ9Mknn1SoxSysGBWOEcHiZ7ZVq1b5vs9syxFTq2Iq1iqrrJJHDGLadIi+nzp1ap3+j6l4ffv21f8tRMzuGDFiRJ0+7tKlS55WW+7j+BpT7L7whS/UPiYeH/8WxMgTC67NQvwMLXBazg477JDXp/z1r39NxxxzTJ7DesIJJ+T98cEsnlCHuB8n1RMnTsxzY2k+5taf5X3zeow+bx7isxvryZZeeuk8pfLMM8/M03Muvvji2v5deeWV5/oe6NatW0XazcL58MMP8zTaOX1m//Of/1SsXTScOBmOadH9+vXLn+VBgwblNYWvvPJK/szGHzxmXYMa/V/+N53mrdyPc/qMF/+/HWuMi9q0aZP/P+B9sHCEpGbojDPOSD/+8Y/n+ZhXX321zqLOeTnrrLNqv4+/Ose82PjLczkk0fL6nJb9Hoi1KWXrr79+PoH61re+lS644IJciANoXnbbbbc6n+kITbGm8Pe//70/WkEjEZKaoZNPPjkdeuih83xMDMcvrPjH94c//GGaPHlyPqHq2bPnbFVy4n5UT/GPc/Pr8+jPWatelfs39pW/6vOW8x6Iz3RMt3vrrbfyX6Ln1r/F9wDNR0yVbt269Rz7VH+2TDFqtMYaa6Q33ngj7bTTTnnK5ZgxY+qMJun/lqPcj9GnUd2uLO5vuOGGtY+ZtVBL/LsfFe+8DxaOkNQMde/ePd8aS1Qzi+k25b849+/fP/3lL3+p85ionBLbaX59Hv0WZcLjH9Py0Hz0ZwSgtddeu/Yx+rzlvAfiMx3z0sv9Hf34ve99L69jaNu2bW3/RoAy1a75iZHCTTbZJA0ZMqS2MumMGTPy/eOOO67SzaMRxJT4N998M33jG9/IfR+f4+jvKP0dYn1hrFnyb3bLENOjI+hEH5dDUUx/j7VG5Uq20dcRlGONWrwnwoMPPpj/LYg/lLEQKl05gsb19ttvl55//vnSoEGDSp06dcrfx238+PF5/5///OfStddeW3r55ZdLr7/+eq56teSSS5Z+8IMf1D7Hf//737zt1FNPzRVzrrjiilLr1q1L9913XwVfGQvb59OmTSutu+66pZ133rn0wgsv5H7s3r176cwzz6x9Dn3efP3zn//Mle2ib998883STTfdlPv3kEMOqVMVqUePHqVvfOMbpVdeeaV066235v7+5S9/WdG2s/CiD6PS1Q033JCrXB111FGlrl271qliSfN18sknlx5++OHSsGHDSo899lhpxx13LC277LKlUaNG5f1HH310qW/fvqUHH3yw9Mwzz5T69++fbzQf8f/o8v+v4/T84osvzt/H/9PDhRdemD/Tf/rTn0ovvfRSaa+99iqtvPLKpYkTJ9Y+x6677lraaKONSk8++WTpH//4R2n11VcvHXjggRV8Vc2bkNTCDRgwIH/YZr099NBDteUhN9xww3wy3bFjx1xi9Oqrr87lY4vi8fG4du3alVZZZZVcXprm2efhrbfeKu22226lJZZYIv+PNv4HPHXq1DrPo8+bp2effba0+eab5/LvHTp0KK211lql888/vzRp0qQ6j3vxxRdLW221VT6xXn755fP/gGnefvGLX+QT5fjMRknwJ554otJNooHEJRh69eqV+zY+r3H/jTfeqN0fJ8rHHHNMLu0ff/DYZ599SsOH
D69om1kw8f/cOf2/O/6fXi4DftZZZ+U/cMW/21/60pdKQ4cOrfMcH330UQ5FcU4Xl/U47LDDav9AyoKrif8szAgUAABAS+Q6SQAAAAVCEgAAQIGQBAAAUCAkAQAAFAhJAAAABUISAABAgZAEAABQICQBAAAUCEkAtHhvvfVWqqmpSS+88EKjPH8895133tkozw3A4ickAdDoDj300LT33ntX7Pf36dMnDR8+PK277rr5/sMPP5yDzZgxYyrWJgCarjaVbgAANLbWrVunnj17VroZADQTRpIAqKhHHnkkbbbZZql9+/apV69e6YwzzkjTpk2r3b/ddtulE044IZ122mlp6aWXzmHnnHPOqfMc//nPf9JWW22VOnTokNZee+30t7/9rc4UuOJ0u/h+++23z9u7deuWt8dIV1hppZXSpZdeWue5N9xwwzq/7/XXX0/bbLNN7e964IEHZntN7777btp///1T165dc5v32muv/HsBaB6EJAAq5v33309f/vKX06abbppefPHFdNVVV6XrrrsunXfeeXUed+ONN6aOHTumJ598Mv3kJz9J5557bm04mT59ep7Kt+SSS+b911xzTfre9743z6l3t99+e/5+6NCheRrez3/+83q1d8aMGWnfffdN7dq1y7/r6quvTqeffnqdx0ydOjXtsssuaamllkp///vf02OPPZY6deqUdt111zRlypSFOEoALG6m2wFQMVdeeWUOLZdffnke0VlzzTXTBx98kIPHD37wg9Sq1cy/5a2//vrp7LPPzt+vvvrq+fFDhgxJO+20Uw5Lb775Zl5nVJ5S96Mf/Sjvm9vUuxjdCcstt1we7amvGKGKUav7778/9e7dO287//zz02677Vb7mN/97nc5TP3qV7/KrykMHjw4/55o484777zQxwuAxUNIAqBiXn311dS/f//aMBG++MUvpgkTJqT33nsv9e3btzYkFcW0vFGjRtWOBkXQKq45iul7jdXe+F3lgBSi/UUxIvbGG2/kkaSiSZMm5TAHQNMnJAHQ5LVt27bO/QhVMVrT0GLkqlQqzTZ9bkFEwNtkk03SzTffPNu+7t27L3IbAWh8QhIAFbPWWmvl9UERTMqjSbGGJ0ZhVlhhhXo9R79+/XKhhJEjR6YePXrkbU8//fQ8fybWFJXXM80aYmKNUtm4cePSsGHD6rQ3flc8JkazwhNPPFHnOTbeeOM85S6m8nXu3LlerwGApkXhBgAWi7Fjx+bqcsXbUUcdlUPH8ccfn9f6/OlPf8prjwYOHFi7Hml+Yu3RqquumgYMGJBeeumlHLK+//3v533FaXxFK664Yt539913p9GjR+fRn7DDDjuk3/zmN7ngwssvv5yfM9Ywle24445pjTXWyNtjWl08btYiEQcddFBadtllc0W72B8hK9YiRYW+mEIIQNMnJAGwWERQ2GijjercfvjDH6a//OUv6amnnkobbLBBOvroo9MRRxxRG3LqI0JMlPqOoBNV8o488sja4BJluudk+eWXT4MGDcrlxmP06bjjjsvbzzzzzLTtttumPfbYI+2+++65al4EsLIIbnfccUeaOHFiXvcUvyuKRBRFlb1HH300r6eKSngx+hSvKdYkGVkCaB5qSrNOvgaAZi5Gk+K6SVFAoRhyAKA+hCQAmr0Y3YlrEUV58AhG3/nOd/KFYv/xj39UumkANEMKNwDQ7I0fPz5fW+mdd97J64Fi7dBFF11U6WYB0EwZSQIAAChQuAEAAKBASAIAACgQkgAAAAqEJAAAgAIhCQAAoEBIAgAAKBCSAAAACoQkAACA9Ln/BxUtSCBB7cB7AAAAAElFTkSuQmCC", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Example 1: Find entities near a specific location (Florida coordinates)\n", - "print(\"🌍 GEOSPATIAL QUERY EXAMPLES\")\n", - "print(\"=\" * 50)\n", - "\n", - "# Find entities within 100km of Orlando, Florida\n", - "orlando_lat, orlando_lng = 28.5383, -81.3792\n", - "radius_km = 100\n", - "\n", - "print(f\"\\n🔍 Searching for entities within {radius_km}km of Orlando, FL\")\n", - "print(f\" Center coordinates: {orlando_lat}, {orlando_lng}\")\n", - "\n", - "nearby_entities = client.get_entities_in_region(orlando_lat, orlando_lng, radius_km)\n", - "print(f\" Found: {nearby_entities.count} entities\")\n", - "\n", - "if nearby_entities.entities:\n", - " nearby_df = entities_to_dataframe(nearby_entities.entities)\n", - " print(f\" Query type: {nearby_entities.query_type}\")\n", - " print(f\" Metadata: {nearby_entities.metadata}\")\n", - " \n", - " print(\"\\n📍 Nearby entities:\")\n", - " for i, entity in enumerate(nearby_entities.entities):\n", - " coords = entity.coordinates\n", - " print(f\" {i+1}. 
{entity.name}\")\n", - " print(f\" Location: {coords.latitude:.4f}, {coords.longitude:.4f}\")\n", - " print(f\" Source: {entity.ber_data_source}\")\n", - " print()\n", - "\n", - "# Example 2: Bounding box query\n", - "print(\"📦 BOUNDING BOX QUERY\")\n", - "print(\"=\" * 30)\n", - "\n", - "# Define a bounding box around Florida\n", - "sw_lat, sw_lng = 25.0, -85.0 # Southwest corner\n", - "ne_lat, ne_lng = 31.0, -80.0 # Northeast corner\n", - "\n", - "print(f\"Searching within bounding box:\")\n", - "print(f\" Southwest: {sw_lat}, {sw_lng}\")\n", - "print(f\" Northeast: {ne_lat}, {ne_lng}\")\n", - "\n", - "bbox_entities = client.find_entities_in_bounding_box(sw_lat, sw_lng, ne_lat, ne_lng)\n", - "print(f\" Found: {bbox_entities.count} entities\")\n", - "\n", - "if bbox_entities.entities:\n", - " bbox_df = entities_to_dataframe(bbox_entities.entities)\n", - " print(f\" Query type: {bbox_entities.query_type}\")\n", - " \n", - " # Visualize the bounding box query results\n", - " plt.figure(figsize=(10, 8))\n", - " \n", - " # Plot all entities in light color\n", - " plt.scatter(entities_df['longitude'], entities_df['latitude'], \n", - " c='lightgray', alpha=0.5, s=30, label='All Entities')\n", - " \n", - " # Plot bounding box entities in bright color\n", - " plt.scatter(bbox_df['longitude'], bbox_df['latitude'], \n", - " c='red', s=100, alpha=0.8, label='Within Bounding Box')\n", - " \n", - " # Draw the bounding box\n", - " bbox_x = [sw_lng, ne_lng, ne_lng, sw_lng, sw_lng]\n", - " bbox_y = [sw_lat, sw_lat, ne_lat, ne_lat, sw_lat]\n", - " plt.plot(bbox_x, bbox_y, 'r--', linewidth=2, label='Bounding Box')\n", - " \n", - " plt.xlabel('Longitude')\n", - " plt.ylabel('Latitude')\n", - " plt.title('Bounding Box Query Results')\n", - " plt.legend()\n", - " plt.grid(True, alpha=0.3)\n", - " plt.show()\n", - "else:\n", - " print(\" No entities found in bounding box\")" - ] - }, - { - "cell_type": "markdown", - "id": "fe5ede07", - "metadata": {}, - "source": [ - "## 7. 
Filtered Queries and Data Source Analysis\n", - "\n", - "Let's explore filtering entities by different criteria and analyze the results." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "03c0108c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏢 QUERYING BY DATA SOURCE\n", - "========================================\n", - "\n", - "📊 NMDC Data Source:\n", - " Entities found: 1\n", - " Sample entity: DSNY_CoreB_TOP\n", - " Entity types: {'sample'}\n", - "\n", - "📊 MONET Data Source:\n", - " Entities found: 1\n", - " Sample entity: MONet Core 60920_7\n", - " Entity types: {'sample'}\n", - "\n", - "📊 EMSL Data Source:\n", - " Entities found: 1\n", - " Sample entity: EMSL Sample c9405190-e962-4ba5-93f0-e3ff499f4488\n", - " Entity types: {'sample'}\n", - "\n", - "📊 ESS-DIVE Data Source:\n", - " Entities found: 1\n", - " Sample entity: NGEE Arctic Council Site, Mile Marker 71, Alaska\n", - " Entity types: {'unspecified'}\n", - "\n", - "📊 JGI Data Source:\n", - " Entities found: 1\n", - " Sample entity: Hot spring microbial communities from Yellowstone National Park, Wyoming, USA - YNP2 Nymph Lake 10\n", - " Entity types: {'jgi_biosample'}\n", - "\n", - "🏷️ QUERYING BY ENTITY TYPE\n", - "========================================\n", - "\n", - "🔖 'sample' entities:\n", - " Found: 3\n", - " Data sources: {'NMDC', 'MONET', 'EMSL'}\n", - "\n", - "🔖 'sequence' entities:\n", - " Found: 0\n", - "\n", - "🔖 'biodata' entities:\n", - " Found: 0\n", - "\n", - "🔖 'taxon' entities:\n", - " Found: 0\n", - "\n", - "🔍 ADVANCED MONGODB QUERY\n", - "========================================\n", - "Advanced query results: 1 entities\n", - "Sample result: DSNY_CoreB_TOP\n", - "\n", - "🔤 NAME PATTERN SEARCH\n", - "========================================\n", - "Pattern 'DSNY': 1 matches\n", - " • DSNY_CoreB_TOP (NMDC)\n", - "Pattern 'Core': 2 matches\n", - " • DSNY_CoreB_TOP (NMDC)\n", - " • MONet Core 60920_7 (MONET)\n", - 
"Pattern 'sample': 1 matches\n", - " • EMSL Sample c9405190-e962-4ba5-93f0-e3ff499f4488 (EMSL)\n" - ] - } - ], - "source": [ - "# Query entities by data source\n", - "print(\"🏢 QUERYING BY DATA SOURCE\")\n", - "print(\"=\" * 40)\n", - "\n", - "data_sources = entities_df['ber_data_source'].unique()\n", - "source_dataframes = {}\n", - "\n", - "for source in data_sources:\n", - " try:\n", - " entities_response = client.find_entities_by_source(source)\n", - " source_df = entities_to_dataframe(entities_response.entities)\n", - " source_dataframes[source] = source_df\n", - " \n", - " print(f\"\\n📊 {source} Data Source:\")\n", - " print(f\" Entities found: {entities_response.count}\")\n", - " if entities_response.entities:\n", - " print(f\" Sample entity: {entities_response.entities[0].name}\")\n", - " print(f\" Entity types: {set(source_df['entity_types'].dropna())}\")\n", - " \n", - " except BertronAPIError as e:\n", - " print(f\" Error querying {source}: {e}\")\n", - "\n", - "# Query entities by entity type\n", - "print(f\"\\n🏷️ QUERYING BY ENTITY TYPE\")\n", - "print(\"=\" * 40)\n", - "\n", - "entity_types = ['sample', 'sequence', 'biodata', 'taxon']\n", - "type_dataframes = {}\n", - "\n", - "for entity_type in entity_types:\n", - " try:\n", - " entities_response = client.find_entities_by_entity_type(entity_type)\n", - " type_df = entities_to_dataframe(entities_response.entities)\n", - " type_dataframes[entity_type] = type_df\n", - " \n", - " print(f\"\\n🔖 '{entity_type}' entities:\")\n", - " print(f\" Found: {entities_response.count}\")\n", - " if entities_response.entities:\n", - " sources = set(type_df['ber_data_source'].dropna())\n", - " print(f\" Data sources: {sources}\")\n", - " \n", - " except BertronAPIError as e:\n", - " print(f\" Error querying {entity_type}: {e}\")\n", - "\n", - "# Advanced query using MongoDB syntax\n", - "print(f\"\\n🔍 ADVANCED MONGODB QUERY\")\n", - "print(\"=\" * 40)\n", - "\n", - "try:\n", - " # Find entities with specific 
characteristics\n", - " advanced_query = {\n", - " \"filter\": {\n", - " \"ber_data_source\": \"NMDC\",\n", - " \"entity_type\": {\"$in\": [\"sample\"]}\n", - " },\n", - " \"limit\": 10\n", - " }\n", - " \n", - " advanced_response = client.find_entities(\n", - " filter_dict=advanced_query[\"filter\"],\n", - " limit=advanced_query[\"limit\"]\n", - " )\n", - " \n", - " print(f\"Advanced query results: {advanced_response.count} entities\")\n", - " if advanced_response.entities:\n", - " advanced_df = entities_to_dataframe(advanced_response.entities)\n", - " print(f\"Sample result: {advanced_response.entities[0].name}\")\n", - " \n", - "except BertronAPIError as e:\n", - " print(f\"Advanced query error: {e}\")\n", - "\n", - "# Search by name pattern\n", - "print(f\"\\n🔤 NAME PATTERN SEARCH\")\n", - "print(\"=\" * 40)\n", - "\n", - "try:\n", - " # Search for entities with specific name patterns\n", - " name_patterns = [\"DSNY\", \"Core\", \"sample\"]\n", - " \n", - " for pattern in name_patterns:\n", - " search_response = client.search_entities_by_name(pattern, case_sensitive=False)\n", - " print(f\"Pattern '{pattern}': {search_response.count} matches\")\n", - " \n", - " if search_response.entities:\n", - " for entity in search_response.entities[:2]: # Show first 2 matches\n", - " print(f\" • {entity.name} ({entity.ber_data_source})\")\n", - " \n", - "except BertronAPIError as e:\n", - " print(f\"Name search error: {e}\")" - ] - }, - { - "cell_type": "markdown", - "id": "2d40c80c", - "metadata": {}, - "source": [ - "## 8. Detailed Entity Examination\n", - "\n", - "Let's examine individual entities in detail and explore the pydantic validation features." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "f2e720d9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🔍 DETAILED ENTITY EXAMINATION\n", - "==================================================\n", - "Retrieving entity with ID: nmdc:bsm-11-bsf8yq62\n", - "\n", - "📋 ENTITY DETAILS\n", - "------------------------------\n", - "Type: \n", - "Name: DSNY_CoreB_TOP\n", - "ID: nmdc:bsm-11-bsf8yq62\n", - "URI: https://api.microbiomedata.org/biosamples/nmdc%3Absm-11-bsf8yq62\n", - "Data Source: NMDC\n", - "Entity Types: ['sample']\n", - "Description: MONet sample represented in NMDC\n", - "\n", - "🌍 COORDINATE DETAILS\n", - "------------------------------\n", - "Latitude: 28.125842\n", - "Longitude: -81.434174\n", - "Elevation: 24.0 m\n", - "Depth: 0.0 - 0.1 m\n", - "\n", - "🔗 ADDITIONAL INFORMATION\n", - "------------------------------\n", - "Alternative IDs: None\n", - "Alternative Names: None\n", - "Collections: None\n", - "\n", - "✅ PYDANTIC VALIDATION FEATURES\n", - "------------------------------\n", - "Model validation: True\n", - "JSON export: True\n", - "Schema generation: True\n", - "JSON keys: ['ber_data_source', 'coordinates', 'entity_type', 'description', 'id', 'name', 'alt_ids', 'alt_names', 'part_of_collection', 'uri']\n", - "\n", - "Single entity DataFrame shape: (1, 15)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnameuriber_data_sourcedescriptionentity_typeslatitudelongitudeelevationelevation_unitdepthdepth_unitalt_ids_countalt_names_countcollections_count
0nmdc:bsm-11-bsf8yq62DSNY_CoreB_TOPhttps://api.microbiomedata.org/biosamples/nmdc...NMDCMONet sample represented in NMDCsample28.125842-81.43417424.0mNonem000
\n", - "
" - ], - "text/plain": [ - " id name \\\n", - "0 nmdc:bsm-11-bsf8yq62 DSNY_CoreB_TOP \n", - "\n", - " uri ber_data_source \\\n", - "0 https://api.microbiomedata.org/biosamples/nmdc... NMDC \n", - "\n", - " description entity_types latitude longitude \\\n", - "0 MONet sample represented in NMDC sample 28.125842 -81.434174 \n", - "\n", - " elevation elevation_unit depth depth_unit alt_ids_count alt_names_count \\\n", - "0 24.0 m None m 0 0 \n", - "\n", - " collections_count \n", - "0 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR:bertron_client:API request failed: 404 Client Error: Not Found for url: http://localhost:8000/bertron/fake-id-12345\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "❌ ERROR HANDLING DEMONSTRATION\n", - "==================================================\n", - "✅ Caught expected API error: API request failed: 404 Client Error: Not Found for url: http://localhost:8000/bertron/fake-id-12345\n", - "\n", - "📊 FINAL DATASET SUMMARY\n", - "==================================================\n", - "Total entities processed: 5\n", - "DataFrame memory usage: 3.14 KB\n", - "Data types:\n", - " id: object\n", - " name: object\n", - " uri: object\n", - " ber_data_source: object\n", - " description: object\n", - " entity_types: object\n", - " latitude: float64\n", - " longitude: float64\n", - " elevation: float64\n", - " elevation_unit: object\n", - " depth: object\n", - " depth_unit: object\n", - " alt_ids_count: int64\n", - " alt_names_count: int64\n", - " collections_count: int64\n", - "\n", - "DataFrame Info:\n", - "\n", - "RangeIndex: 5 entries, 0 to 4\n", - "Data columns (total 15 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 5 non-null object \n", - " 1 name 5 non-null object \n", - " 2 uri 5 non-null object \n", - " 3 ber_data_source 5 non-null object \n", - " 4 
description 4 non-null object \n", - " 5 entity_types 5 non-null object \n", - " 6 latitude 5 non-null float64\n", - " 7 longitude 5 non-null float64\n", - " 8 elevation 3 non-null float64\n", - " 9 elevation_unit 3 non-null object \n", - " 10 depth 0 non-null object \n", - " 11 depth_unit 1 non-null object \n", - " 12 alt_ids_count 5 non-null int64 \n", - " 13 alt_names_count 5 non-null int64 \n", - " 14 collections_count 5 non-null int64 \n", - "dtypes: float64(3), int64(3), object(9)\n", - "memory usage: 732.0+ bytes\n" - ] - } - ], - "source": [ - "# Get a specific entity by ID for detailed examination\n", - "if all_entities_response.entities and all_entities_response.entities[0].id:\n", - " entity_id = all_entities_response.entities[0].id\n", - " \n", - " print(f\"🔍 DETAILED ENTITY EXAMINATION\")\n", - " print(\"=\" * 50)\n", - " print(f\"Retrieving entity with ID: {entity_id}\")\n", - " \n", - " try:\n", - " detailed_entity = client.get_entity_by_id(entity_id)\n", - " \n", - " print(f\"\\n📋 ENTITY DETAILS\")\n", - " print(\"-\" * 30)\n", - " print(f\"Type: {type(detailed_entity)}\")\n", - " print(f\"Name: {detailed_entity.name}\")\n", - " print(f\"ID: {detailed_entity.id}\")\n", - " print(f\"URI: {detailed_entity.uri}\")\n", - " print(f\"Data Source: {detailed_entity.ber_data_source}\")\n", - " print(f\"Entity Types: {detailed_entity.entity_type}\")\n", - " print(f\"Description: {detailed_entity.description}\")\n", - " \n", - " print(f\"\\n🌍 COORDINATE DETAILS\")\n", - " print(\"-\" * 30)\n", - " coords = detailed_entity.coordinates\n", - " print(f\"Latitude: {coords.latitude}\")\n", - " print(f\"Longitude: {coords.longitude}\")\n", - " \n", - " if coords.elevation:\n", - " print(f\"Elevation: {coords.elevation.has_numeric_value} {coords.elevation.has_unit}\")\n", - " if coords.depth:\n", - " depth_val = coords.depth.has_numeric_value\n", - " depth_min = coords.depth.has_minimum_numeric_value\n", - " depth_max = coords.depth.has_maximum_numeric_value\n", - " 
depth_unit = coords.depth.has_unit\n", - " \n", - " if depth_min is not None and depth_max is not None:\n", - " print(f\"Depth: {depth_min} - {depth_max} {depth_unit}\")\n", - " elif depth_val is not None:\n", - " print(f\"Depth: {depth_val} {depth_unit}\")\n", - " \n", - " print(f\"\\n🔗 ADDITIONAL INFORMATION\")\n", - " print(\"-\" * 30)\n", - " print(f\"Alternative IDs: {detailed_entity.alt_ids}\")\n", - " print(f\"Alternative Names: {detailed_entity.alt_names}\")\n", - " print(f\"Collections: {detailed_entity.part_of_collection}\")\n", - " \n", - " # Demonstrate pydantic validation\n", - " print(f\"\\n✅ PYDANTIC VALIDATION FEATURES\")\n", - " print(\"-\" * 30)\n", - " print(f\"Model validation: {hasattr(detailed_entity, 'model_validate')}\")\n", - " print(f\"JSON export: {hasattr(detailed_entity, 'model_dump')}\")\n", - " print(f\"Schema generation: {hasattr(detailed_entity, 'model_json_schema')}\")\n", - " \n", - " # Export to JSON\n", - " entity_json = detailed_entity.model_dump()\n", - " print(f\"JSON keys: {list(entity_json.keys())}\")\n", - " \n", - " # Create a DataFrame with just this entity for demonstration\n", - " single_entity_df = entities_to_dataframe([detailed_entity])\n", - " print(f\"\\nSingle entity DataFrame shape: {single_entity_df.shape}\")\n", - " display(single_entity_df)\n", - " \n", - " except BertronAPIError as e:\n", - " print(f\"Error retrieving entity: {e}\")\n", - "else:\n", - " print(\"⚠️ No entity ID available for detailed examination\")\n", - "\n", - "# Demonstrate error handling\n", - "print(f\"\\n❌ ERROR HANDLING DEMONSTRATION\")\n", - "print(\"=\" * 50)\n", - "\n", - "try:\n", - " # Try to get a non-existent entity\n", - " fake_entity = client.get_entity_by_id(\"fake-id-12345\")\n", - "except BertronAPIError as e:\n", - " print(f\"✅ Caught expected API error: {e}\")\n", - "except Exception as e:\n", - " print(f\"❌ Unexpected error: {e}\")\n", - "\n", - "# Summary statistics for the entire dataset\n", - "print(f\"\\n📊 FINAL 
DATASET SUMMARY\")\n", - "print(\"=\" * 50)\n", - "print(f\"Total entities processed: {len(entities_df)}\")\n", - "print(f\"DataFrame memory usage: {entities_df.memory_usage(deep=True).sum() / 1024:.2f} KB\")\n", - "print(f\"Data types:\")\n", - "for col, dtype in entities_df.dtypes.items():\n", - " print(f\" {col}: {dtype}\")\n", - "\n", - "# Show the complete DataFrame info\n", - "print(f\"\\nDataFrame Info:\")\n", - "entities_df.info()" - ] - }, - { - "cell_type": "markdown", - "id": "bee726a0", - "metadata": {}, - "source": [ - "## 9. Conclusion\n", - "\n", - "This notebook has demonstrated the comprehensive functionality of the BERtron Python client, including:\n", - "\n", - "### ✅ **Features Demonstrated**\n", - "- **Client Initialization**: Connected to BERtron API and tested health status\n", - "- **Data Retrieval**: Retrieved all entities using the `get_all_entities()` method\n", - "- **DataFrame Conversion**: Converted pydantic Entity objects to pandas DataFrames for analysis\n", - "- **Data Analysis**: Performed statistical analysis and created visualizations\n", - "- **Geospatial Queries**: Used both nearby searches and bounding box queries\n", - "- **Filtered Queries**: Filtered by data source, entity type, and name patterns\n", - "- **Advanced Queries**: Demonstrated MongoDB-style query syntax\n", - "- **Entity Details**: Examined individual entities with full type safety\n", - "- **Error Handling**: Showed proper exception handling for API errors\n", - "\n", - "### 🚀 **Key Benefits**\n", - "- **Type Safety**: Full pydantic validation ensures data integrity\n", - "- **Easy Integration**: Simple conversion to pandas for data science workflows \n", - "- **Rich Querying**: Support for geospatial, filtered, and advanced queries\n", - "- **Structured Data**: Well-organized coordinates and metadata\n", - "- **Error Resilience**: Robust error handling for production use\n", - "\n", - "### 🔗 **Next Steps**\n", - "- Export data to different formats (CSV, JSON, 
etc.)\n", - "- Integrate with other geospatial libraries (folium, geopandas)\n", - "- Create more complex analytical workflows\n", - "- Build interactive dashboards using the client\n", - "\n", - "The BERtron client successfully bridges the gap between the BER data ecosystem and modern Python data science tools! 🎉" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "BERtron (Python 3.13)", - "language": "python", - "name": "bertron" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docker-compose.yml b/docker-compose.yml index 2da64b5..1068267 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,8 +9,8 @@ services: # host environment. # Docs: https://docs.docker.com/compose/how-tos/environment-variables/variable-interpolation/#additional-information environment: - MONGO_HOST: ${MONGO_HOST:?} - MONGO_PORT: ${MONGO_PORT:?} + MONGO_HOST: mongo + MONGO_PORT: 27017 MONGO_USERNAME: ${MONGO_USERNAME:?} MONGO_PASSWORD: ${MONGO_PASSWORD:?} MONGO_DATABASE: ${MONGO_DATABASE:?} @@ -29,6 +29,12 @@ services: volumes: # Mount the root directory of the repository, at `/app` within the container. - ".:/app" + # Create an anonymous volume to mask the host's Python virtual environment when mounting. + # That way, the host's Python virtual environment does not interfere with the container's + # and vice versa, and the container does not have to customize `VIRTUAL_ENV`. + # TODO: Consider using this approach for others services that use a Python virtual environment. + # Sharing the `.venv` directory between host and container can be problematic. 
+ - "/app/.venv" mongo: image: mongo:8.0.11 @@ -48,32 +54,44 @@ services: ingest: # Use the same container image as the app service for consistency - build: { context: ".", dockerfile: Dockerfile, target: development } + build: { context: ".", dockerfile: Dockerfile, target: test } # This service should not start automatically - only run on demand profiles: ["tools"] + environment: + # Note: We use `VIRTUAL_ENV` to customize the path at which `uv` looks for and, + # if necessary, creates a Python virtual environment. By using a path + # outside of `/app`, we avoid interfering with—and using—any Python + # virtual environment the host might have created at `/app/.venv`. + # Reference: https://docs.astral.sh/uv/pip/environments/#using-arbitrary-python-environments + VIRTUAL_ENV: /app_venv volumes: - # Mount the root directory to access the ingest script and data files - - ".:/app" + - ".:/app" # Need to mount current directory to pick up uv install files + - "./tests/data:/test_data" # to access the test data files depends_on: - mongo - # Run ingest with data dir mounted to /data - command: ["uv", "run", "python", "/app/mongodb/ingest_data.py", "--mongo-uri", "mongodb://admin:root@mongo:27017", "--input", "/data", "--clean"] + # Run ingest with data dir mounted to /test_data + command: ["uv", "run", "--active", "python", "/app/mongodb/ingest_data.py", "--mongo-uri", "mongodb://${MONGO_USERNAME}:${MONGO_PASSWORD}@mongo:27017", "--input", "/test_data", "--clean"] test: # Use the same container image as the app service for consistency - build: { context: ".", dockerfile: Dockerfile, target: development } + build: { context: ".", dockerfile: Dockerfile, target: test } # This service should not start automatically - only run on demand profiles: ["tools"] + volumes: + # Mount the root directory to access the ingest script and data files + - ".:/app" environment: - MONGO_HOST: ${MONGO_HOST:?} - MONGO_PORT: ${MONGO_PORT:?} + MONGO_HOST: mongo + MONGO_PORT: 27017 
MONGO_USERNAME: ${MONGO_USERNAME:?} MONGO_PASSWORD: ${MONGO_PASSWORD:?} - MONGO_DATABASE: ${MONGO_DATABASE:?} # the test suite will disregard this + MONGO_DATABASE: ${MONGO_DATABASE:?} # reminder: the test suite patches this value + VIRTUAL_ENV: /app_venv depends_on: - app - mongo - command: ["uv", "run", "pytest", "-v"] + command: ["uv", "run", "--active", "pytest", "-v"] + volumes: # Define a named volume that will contain MongoDB data. diff --git a/mongodb/ingest_data.py b/mongodb/ingest_data.py index e71f852..575d9e5 100644 --- a/mongodb/ingest_data.py +++ b/mongodb/ingest_data.py @@ -6,49 +6,54 @@ import os import sys from datetime import datetime -from typing import Dict, List, Any, Optional +from typing import Dict, Optional +from schema.datamodel.bertron_schema_pydantic import Entity -import pymongo +from pymongo import MongoClient, GEOSPHERE +from pymongo.database import Database from pymongo.errors import ConnectionFailure, PyMongoError from jsonschema import validate, ValidationError -import requests +import httpx + # Set up logging logging.basicConfig( level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[logging.StreamHandler()] + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler()], ) -logger = logging.getLogger('bertron-ingest') +logger = logging.getLogger("bertron-ingest") class BertronMongoDBIngestor: """Class to handle ingestion of BERtron data into MongoDB.""" - + def __init__(self, mongo_uri: str, db_name: str, schema_path: str): """Initialize the ingestor with connection and schema details.""" - self.mongo_uri = mongo_uri - self.db_name = db_name - self.schema_path = schema_path - self.client = None - self.db = None - self.schema = None - + self.mongo_uri: str = mongo_uri + self.db_name: str = db_name + self.schema_path: Optional[str] = schema_path + self.client: Optional[MongoClient] = None + self.db: Optional[Database] = None + self.schema: 
Optional[dict] = None + def connect(self) -> None: """Connect to MongoDB.""" try: logger.info(f"Connecting to MongoDB at {self.mongo_uri}") - self.client = pymongo.MongoClient(self.mongo_uri) + self.client = MongoClient(self.mongo_uri) + logger.info(f"Using MongoDB database: {self.db_name}") self.db = self.client[self.db_name] except ConnectionFailure as e: logger.error(f"Failed to connect to MongoDB: {e}") sys.exit(1) - + def clean_collections(self) -> None: """Delete existing collections to start fresh.""" + assert self.db is not None, "Connection to database has not been established" try: collection_names = self.db.list_collection_names() - if 'entities' in collection_names: + if "entities" in collection_names: logger.info("Dropping existing 'entities' collection") self.db.entities.drop() logger.info("Successfully dropped 'entities' collection") @@ -57,162 +62,181 @@ def clean_collections(self) -> None: except PyMongoError as e: logger.error(f"Error dropping collections: {e}") sys.exit(1) - + def load_schema(self) -> Dict: """Load the JSON schema from file.""" + assert isinstance(self.schema_path, str), "Schema path has not been set" try: logger.info(f"Loading schema from {self.schema_path}") - if self.schema_path.startswith('http://') or self.schema_path.startswith('https://'): - response = requests.get(self.schema_path) + if self.schema_path.startswith(("http://", "https://")): + response = httpx.get(self.schema_path) response.raise_for_status() self.schema = response.json() else: - with open(self.schema_path, 'r') as f: + with open(self.schema_path, "r") as f: self.schema = json.load(f) + if not isinstance(self.schema, dict): + raise ValueError("Failed to parse schema into a Python dictionary") return self.schema except (FileNotFoundError, json.JSONDecodeError) as e: logger.error(f"Failed to load schema: {e}") sys.exit(1) - + def validate_data(self, data: Dict) -> bool: """Validate data against the loaded schema.""" + assert isinstance(self.schema, dict), 
"Schema has not been loaded" try: validate(instance=data, schema=self.schema) + _ = Entity(**data) # Validate against Pydantic model return True except ValidationError as e: logger.error(f"Validation error: {e}") return False - + def insert_entity(self, entity: Dict) -> Optional[str]: """Insert an entity into the 'entities' collection.""" + assert isinstance(self.schema, dict), "Schema has not been loaded" + assert self.db is not None, "Connection to database has not been established" try: # Add metadata - entity['_metadata'] = { - 'ingested_at': datetime.utcnow(), - 'schema_version': self.schema.get('version', 'unknown') + entity["_metadata"] = { + "ingested_at": datetime.utcnow(), + "schema_version": self.schema.get("version", "unknown"), } - + # convert latitude and longitude to mongoDB GeoJSON format - if 'coordinates' in entity: - coordinates = entity['coordinates'] - if isinstance(coordinates, dict) and 'latitude' in coordinates and 'longitude' in coordinates: - entity['geojson'] = { - 'type': 'Point', - 'coordinates': [coordinates['longitude'], coordinates['latitude']] + if "coordinates" in entity: + coordinates = entity["coordinates"] + if ( + isinstance(coordinates, dict) + and "latitude" in coordinates + and "longitude" in coordinates + ): + entity["geojson"] = { + "type": "Point", + "coordinates": [ + coordinates["longitude"], + coordinates["latitude"], + ], } else: - logger.error(f"Invalid coordinates format for entity: {entity.get('name', entity.get('id', 'unnamed'))}") + logger.error( + f"Invalid coordinates format for entity: {entity.get('name', entity.get('id', 'unnamed'))}" + ) return None - # Create indexes for common query patterns - self.db.entities.create_index('uri', unique=True) - self.db.entities.create_index('ber_data_source') - self.db.entities.create_index('data_type') - + self.db.entities.create_index("uri", unique=True) + self.db.entities.create_index("ber_data_source") + self.db.entities.create_index("data_type") + # Create 2dsphere 
index for geospatial queries on coordinates - self.db.entities.create_index([('geojson', pymongo.GEOSPHERE)]) - + self.db.entities.create_index([("geojson", GEOSPHERE)]) + # Insert with upsert to handle potential duplicates based on URI result = self.db.entities.update_one( - {'uri': entity['uri']}, - {'$set': entity}, - upsert=True + {"uri": entity["uri"]}, {"$set": entity}, upsert=True ) - + if result.upserted_id: - logger.info(f"Inserted entity: {entity.get('name', entity.get('id', 'unnamed'))}") + logger.info( + f"Inserted entity: {entity.get('name', entity.get('id', 'unnamed'))}" + ) return str(result.upserted_id) else: - logger.info(f"Updated entity: {entity.get('name', entity.get('id', 'unnamed'))}") + logger.info( + f"Updated entity: {entity.get('name', entity.get('id', 'unnamed'))}" + ) return None except PyMongoError as e: logger.error(f"Error inserting entity: {e}") return None - + def ingest_file(self, filepath: str) -> Dict[str, int]: """Ingest entities from a JSON file.""" - stats = { - 'processed': 0, - 'valid': 0, - 'invalid': 0, - 'inserted': 0, - 'error': 0 - } - + stats = {"processed": 0, "valid": 0, "invalid": 0, "inserted": 0, "error": 0} + try: - with open(filepath, 'r') as f: + with open(filepath, "r") as f: data = json.load(f) - + # Handle both single entity and array of entities entities = data if isinstance(data, list) else [data] - stats['processed'] = len(entities) - + stats["processed"] = len(entities) + for entity in entities: if self.validate_data(entity): - stats['valid'] += 1 + stats["valid"] += 1 if self.insert_entity(entity): - stats['inserted'] += 1 + stats["inserted"] += 1 else: - stats['invalid'] += 1 - + stats["invalid"] += 1 + except (FileNotFoundError, json.JSONDecodeError) as e: logger.error(f"Error processing file {filepath}: {e}") - stats['error'] += 1 - + stats["error"] += 1 + return stats - + def close(self) -> None: """Close the MongoDB connection.""" if self.client: self.client.close() logger.info("MongoDB connection 
closed") - + def main(): """Main function to run the ingestor.""" - parser = argparse.ArgumentParser(description='Ingest data into MongoDB based on BERtron schema') - parser.add_argument('--mongo-uri', default='mongodb://localhost:27017', - help='MongoDB connection URI') - parser.add_argument('--db-name', default='bertron', - help='MongoDB database name') - parser.add_argument('--schema-path', - default='https://raw.githubusercontent.com/ber-data/bertron-schema/refs/heads/main/src/schema/jsonschema/bertron_schema.json', - help='Path or URL to the BERtron schema JSON file') - parser.add_argument('--input', required=True, - help='Path to the input JSON file or directory') - parser.add_argument('--clean', action='store_true', - help='Delete existing collections before ingesting new data') - + parser = argparse.ArgumentParser( + description="Ingest data into MongoDB based on BERtron schema" + ) + parser.add_argument( + "--mongo-uri", + default="mongodb://localhost:27017", + help="MongoDB connection URI", + ) + parser.add_argument("--db-name", default="bertron", help="MongoDB database name") + parser.add_argument( + "--schema-path", + default="https://raw.githubusercontent.com/ber-data/bertron-schema/refs/heads/main/src/schema/jsonschema/bertron_schema.json", + help="Path or URL to the BERtron schema JSON file", + ) + parser.add_argument( + "--input", required=True, help="Path to the input JSON file or directory" + ) + parser.add_argument( + "--clean", + action="store_true", + help="Delete existing collections before ingesting new data", + ) + args = parser.parse_args() - + ingestor = BertronMongoDBIngestor( - mongo_uri=args.mongo_uri, - db_name=args.db_name, - schema_path=args.schema_path + mongo_uri=args.mongo_uri, db_name=args.db_name, schema_path=args.schema_path ) - + try: ingestor.connect() ingestor.load_schema() - + # Clean collections if requested if args.clean: logger.info("Clean flag enabled - removing existing collections") ingestor.clean_collections() - + 
total_stats = { - 'processed': 0, - 'valid': 0, - 'invalid': 0, - 'inserted': 0, - 'error': 0 + "processed": 0, + "valid": 0, + "invalid": 0, + "inserted": 0, + "error": 0, } - + # Process a single file or all JSON files in a directory if os.path.isdir(args.input): for filename in os.listdir(args.input): - if filename.endswith('.json'): + if filename.endswith(".json"): file_path = os.path.join(args.input, filename) logger.info(f"Processing file: {file_path}") stats = ingestor.ingest_file(file_path) @@ -222,7 +246,7 @@ def main(): # Process a single file logger.info(f"Processing file: {args.input}") total_stats = ingestor.ingest_file(args.input) - + # Report results logger.info("Ingestion completed") logger.info(f"Total processed: {total_stats['processed']}") @@ -230,7 +254,7 @@ def main(): logger.info(f"Invalid entities: {total_stats['invalid']}") logger.info(f"Inserted entities: {total_stats['inserted']}") logger.info(f"Errors: {total_stats['error']}") - + finally: ingestor.close() diff --git a/mongodb/legacy/geo_importer.py b/mongodb/legacy/geo_importer.py index 9319558..e7584ad 100644 --- a/mongodb/legacy/geo_importer.py +++ b/mongodb/legacy/geo_importer.py @@ -4,7 +4,7 @@ This script imports geospatial data from three different sources: 1. latlon_project_ids.json - Project location data -2. ess_dive_packages.csv - ESS-DIVE package centroids +2. ess_dive_packages.csv - ESS-DIVE package centroids 3. nmdc_biosample_geo_coordinates.csv - NMDC biosample locations Usage: @@ -24,323 +24,338 @@ # Configure logging logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) -logger = logging.getLogger('geo-importer') +logger = logging.getLogger("geo-importer") class MongoDBImporter: """MongoDB geospatial data importer.""" - + def __init__(self, connection_string: str = "mongodb://localhost:27017"): """Initialize MongoDB connection. 
- + Args: connection_string: MongoDB connection URI """ self.client = MongoClient(connection_string) self.db = self.client.geospatialDB self.collection = self.db.locations - + # Ensure indexes self._create_indexes() - + def _create_indexes(self) -> None: """Create necessary indexes on the collection.""" self.collection.create_index([("coordinates", GEOSPHERE)]) self.collection.create_index("dataset_id") self.collection.create_index("system_name") logger.info("Database indexes created or verified") - + def import_proposal_locations(self, file_path: str) -> int: """Import data from the proposal locations JSON file. - + Args: file_path: Path to the latlon_project_ids.json file - + Returns: Number of documents imported """ logger.info(f"Processing proposal locations from {file_path}") - + try: - with open(file_path, 'r') as f: + with open(file_path, "r") as f: data = json.load(f) - + if not data: logger.warning("Empty proposal data file") return 0 - + # Transform the data into MongoDB documents documents = [] for item in data: try: - latitude = float(item.get('latitude')) - longitude = float(item.get('longitude')) - + latitude = float(item.get("latitude")) + longitude = float(item.get("longitude")) + if not (latitude and longitude): logger.warning(f"Missing coordinates in item: {item}") continue - - documents.append({ - 'dataset_id': item.get('proposal_id'), - 'system_name': "EMSL", - 'coordinates': { - 'type': 'Point', - 'coordinates': [longitude, latitude] - }, - 'metadata': { - 'sampling_set': item.get('sampling_set'), - 'description': item.get('description'), - 'source': 'project_locations' + + documents.append( + { + "dataset_id": item.get("proposal_id"), + "system_name": "EMSL", + "coordinates": { + "type": "Point", + "coordinates": [longitude, latitude], + }, + "metadata": { + "sampling_set": item.get("sampling_set"), + "description": item.get("description"), + "source": "project_locations", + }, } - }) + ) except (ValueError, TypeError) as e: 
logger.warning(f"Error processing item {item}: {e}") continue - + if documents: result = self.collection.insert_many(documents) - logger.info(f"Inserted {len(result.inserted_ids)} proposal location documents") + logger.info( + f"Inserted {len(result.inserted_ids)} proposal location documents" + ) return len(result.inserted_ids) else: logger.warning("No valid proposal documents to insert") return 0 - + except Exception as e: logger.error(f"Error importing proposal locations: {e}") raise - + def import_ess_dive_packages(self, file_path: str) -> int: """Import data from the ESS-DIVE packages CSV file. - + Args: file_path: Path to the ess_dive_packages.csv file - + Returns: Number of documents imported """ logger.info(f"Processing ESS-DIVE packages from {file_path}") - + try: # Use pandas for efficient CSV handling df = pd.read_csv(file_path) - + if df.empty: logger.warning("Empty ESS-DIVE data file") return 0 - + # Transform into MongoDB documents documents = [] for _, row in df.iterrows(): try: - latitude = float(row.get('centroid_latitude')) - longitude = float(row.get('centroid_longitude')) - + latitude = float(row.get("centroid_latitude")) + longitude = float(row.get("centroid_longitude")) + if pd.isna(latitude) or pd.isna(longitude): continue - - documents.append({ - 'dataset_id': row.get('package_id'), - 'system_name': 'ESSDIVE', - 'coordinates': { - 'type': 'Point', - 'coordinates': [longitude, latitude] - }, - 'metadata': { - 'source': 'ESS-DIVE', - 'row_id': int(row.get('Unnamed: 0')) if not pd.isna(row.get('Unnamed: 0')) else None + + documents.append( + { + "dataset_id": row.get("package_id"), + "system_name": "ESSDIVE", + "coordinates": { + "type": "Point", + "coordinates": [longitude, latitude], + }, + "metadata": { + "source": "ESS-DIVE", + "row_id": int(row.get("Unnamed: 0")) + if not pd.isna(row.get("Unnamed: 0")) + else None, + }, } - }) + ) except (ValueError, TypeError) as e: logger.warning(f"Error processing ESS-DIVE row: {e}") continue - + if 
documents: # Use bulk insert for better performance result = self.collection.insert_many(documents) - logger.info(f"Inserted {len(result.inserted_ids)} ESS-DIVE package documents") + logger.info( + f"Inserted {len(result.inserted_ids)} ESS-DIVE package documents" + ) return len(result.inserted_ids) else: logger.warning("No valid ESS-DIVE documents to insert") return 0 - + except Exception as e: logger.error(f"Error importing ESS-DIVE packages: {e}") raise - + def import_nmdc_biosamples(self, file_path: str) -> int: """Import data from the NMDC biosample coordinates CSV file. - + Args: file_path: Path to the nmdc_biosample_geo_coordinates.csv file - + Returns: Number of documents imported """ logger.info(f"Processing NMDC biosamples from {file_path}") - + try: # Use pandas for efficient CSV handling df = pd.read_csv(file_path) - + if df.empty: logger.warning("Empty NMDC biosample data file") return 0 - + # Transform into MongoDB documents documents = [] for _, row in df.iterrows(): try: - latitude = float(row.get('latitude')) - longitude = float(row.get('longitude')) - + latitude = float(row.get("latitude")) + longitude = float(row.get("longitude")) + if pd.isna(latitude) or pd.isna(longitude): continue - - documents.append({ - 'dataset_id': row.get('biosample_id'), - 'system_name': 'NMDC', - 'coordinates': { - 'type': 'Point', - 'coordinates': [longitude, latitude] - }, - 'metadata': { - 'source': 'NMDC-Biosample' + + documents.append( + { + "dataset_id": row.get("biosample_id"), + "system_name": "NMDC", + "coordinates": { + "type": "Point", + "coordinates": [longitude, latitude], + }, + "metadata": {"source": "NMDC-Biosample"}, } - }) + ) except (ValueError, TypeError) as e: logger.warning(f"Error processing NMDC biosample row: {e}") continue - + if documents: # Use bulk insert for better performance result = self.collection.insert_many(documents) - logger.info(f"Inserted {len(result.inserted_ids)} NMDC biosample documents") + logger.info( + f"Inserted 
{len(result.inserted_ids)} NMDC biosample documents" + ) return len(result.inserted_ids) else: logger.warning("No valid NMDC biosample documents to insert") return 0 - + except Exception as e: logger.error(f"Error importing NMDC biosamples: {e}") raise - + def import_jgi_gold_biosamples(self, file_path: str) -> int: """Import data from the JGI GOLD biosample coordinates CSV file. - + Args: file_path: Path to the jgi_gold_biosample_geo.csv file - + Returns: Number of documents imported """ logger.info(f"Processing JGI GOLD biosamples from {file_path}") - + try: # Use pandas for efficient CSV handling df = pd.read_csv(file_path) - + if df.empty: logger.warning("Empty JGI GOLD biosample data file") return 0 - + # Transform into MongoDB documents documents = [] for _, row in df.iterrows(): try: - latitude = float(row.get('latitude')) - longitude = float(row.get('longitude')) - + latitude = float(row.get("latitude")) + longitude = float(row.get("longitude")) + if pd.isna(latitude) or pd.isna(longitude): continue - - documents.append({ - 'dataset_id': row.get('gold_id'), - 'system_name': 'JGI-Biosamples', - 'coordinates': { - 'type': 'Point', - 'coordinates': [longitude, latitude] - }, - 'metadata': { - 'source': 'JGI-GOLD-Biosample' + + documents.append( + { + "dataset_id": row.get("gold_id"), + "system_name": "JGI-Biosamples", + "coordinates": { + "type": "Point", + "coordinates": [longitude, latitude], + }, + "metadata": {"source": "JGI-GOLD-Biosample"}, } - }) + ) except (ValueError, TypeError) as e: logger.warning(f"Error processing JGI GOLD biosample row: {e}") continue - + if documents: # Use bulk insert for better performance result = self.collection.insert_many(documents) - logger.info(f"Inserted {len(result.inserted_ids)} JGI GOLD biosample documents") + logger.info( + f"Inserted {len(result.inserted_ids)} JGI GOLD biosample documents" + ) return len(result.inserted_ids) else: logger.warning("No valid JGI GOLD biosample documents to insert") return 0 - + except 
Exception as e: logger.error(f"Error importing JGI GOLD biosamples: {e}") raise - + def import_jgi_gold_organisms(self, file_path: str) -> int: """Import data from the JGI GOLD organism coordinates CSV file. - + Args: file_path: Path to the jgi_gold_organism_geo.csv file - + Returns: Number of documents imported """ logger.info(f"Processing JGI GOLD organisms from {file_path}") - + try: # Use pandas for efficient CSV handling df = pd.read_csv(file_path) - + if df.empty: logger.warning("Empty JGI GOLD organism data file") return 0 - + # Transform into MongoDB documents documents = [] for _, row in df.iterrows(): try: - latitude = float(row.get('latitude')) - longitude = float(row.get('longitude')) - + latitude = float(row.get("latitude")) + longitude = float(row.get("longitude")) + if pd.isna(latitude) or pd.isna(longitude): continue - - documents.append({ - 'dataset_id': row.get('gold_id'), - 'system_name': 'JGI-Organism', - 'coordinates': { - 'type': 'Point', - 'coordinates': [longitude, latitude] - }, - 'metadata': { - 'source': 'JGI-GOLD-Organism' + + documents.append( + { + "dataset_id": row.get("gold_id"), + "system_name": "JGI-Organism", + "coordinates": { + "type": "Point", + "coordinates": [longitude, latitude], + }, + "metadata": {"source": "JGI-GOLD-Organism"}, } - }) + ) except (ValueError, TypeError) as e: logger.warning(f"Error processing JGI GOLD organism row: {e}") continue - + if documents: # Use bulk insert for better performance result = self.collection.insert_many(documents) - logger.info(f"Inserted {len(result.inserted_ids)} JGI GOLD organism documents") + logger.info( + f"Inserted {len(result.inserted_ids)} JGI GOLD organism documents" + ) return len(result.inserted_ids) else: logger.warning("No valid JGI GOLD organism documents to insert") return 0 - + except Exception as e: logger.error(f"Error importing JGI GOLD organisms: {e}") raise - + def close(self) -> None: """Close the MongoDB connection.""" self.client.close() @@ -349,77 +364,88 @@ 
def close(self) -> None: def validate_file(file_path: str) -> bool: """Check if file exists and is readable. - + Args: file_path: Path to the file to check - + Returns: True if file exists and is readable, False otherwise """ if not os.path.exists(file_path): logger.warning(f"File not found: {file_path}") return False - + if not os.path.isfile(file_path): logger.warning(f"Not a file: {file_path}") return False - + if not os.access(file_path, os.R_OK): logger.warning(f"File not readable: {file_path}") return False - + return True def main(): """Main function to run the import process.""" - parser = argparse.ArgumentParser(description='Import geospatial data into MongoDB') - parser.add_argument('--data-dir', type=str, default='./data', - help='Directory containing data files') - parser.add_argument('--mongodb-uri', type=str, default='mongodb://localhost:27017', - help='MongoDB connection string') - parser.add_argument('--clear-collection', action='store_true', - help='Clear the collection before importing') - parser.add_argument('--skip-large-files', action='store_true', - help='Skip large JGI GOLD files (useful for testing)') + parser = argparse.ArgumentParser(description="Import geospatial data into MongoDB") + parser.add_argument( + "--data-dir", type=str, default="./data", help="Directory containing data files" + ) + parser.add_argument( + "--mongodb-uri", + type=str, + default="mongodb://localhost:27017", + help="MongoDB connection string", + ) + parser.add_argument( + "--clear-collection", + action="store_true", + help="Clear the collection before importing", + ) + parser.add_argument( + "--skip-large-files", + action="store_true", + help="Skip large JGI GOLD files (useful for testing)", + ) args = parser.parse_args() - + # Check data directory if not os.path.exists(args.data_dir): logger.error(f"Data directory does not exist: {args.data_dir}") return 1 - + # Set up file paths - proposal_file = os.path.join(args.data_dir, 'latlon_project_ids.json') - 
ess_dive_file = os.path.join(args.data_dir, 'ess_dive_packages.csv') - nmdc_file = os.path.join(args.data_dir, 'nmdc_biosample_geo_coordinates.csv') - jgi_biosample_file = os.path.join(args.data_dir, 'jgi_gold_biosample_geo.csv') - jgi_organism_file = os.path.join(args.data_dir, 'jgi_gold_organism_geo.csv') - + proposal_file = os.path.join(args.data_dir, "latlon_project_ids.json") + ess_dive_file = os.path.join(args.data_dir, "ess_dive_packages.csv") + nmdc_file = os.path.join(args.data_dir, "nmdc_biosample_geo_coordinates.csv") + jgi_biosample_file = os.path.join(args.data_dir, "jgi_gold_biosample_geo.csv") + jgi_organism_file = os.path.join(args.data_dir, "jgi_gold_organism_geo.csv") + # Validate files files_valid = [ validate_file(proposal_file), validate_file(ess_dive_file), validate_file(nmdc_file), validate_file(jgi_biosample_file), - validate_file(jgi_organism_file) + validate_file(jgi_organism_file), ] - + if not any(files_valid): logger.error("No valid files found to import") return 1 - + # Initialize MongoDB importer importer = MongoDBImporter(args.mongodb_uri) - + # Clear collection if requested if args.clear_collection: logger.info("Clearing collection before import") importer.collection.delete_many({}) - + # Import each file if valid total_imported = 0 - + if files_valid[0]: try: logger.info("Importing proposal locations...") @@ -427,7 +453,7 @@ def main(): total_imported += count except Exception as e: logger.error(f"Failed to import proposal locations: {e}") - + if files_valid[1]: try: logger.info("Importing ESS-DIVE packages...") @@ -435,7 +461,7 @@ def main(): total_imported += count except Exception as e: logger.error(f"Failed to import ESS-DIVE packages: {e}") - + if files_valid[2]: try: logger.info("Importing NMDC biosamples...") @@ -443,30 +469,34 @@ def main(): total_imported += count except Exception as e: logger.error(f"Failed to import NMDC biosamples: {e}") - + # Import JGI GOLD files unless skipped if not args.skip_large_files: if 
files_valid[3]: try: - logger.info("Importing JGI GOLD biosamples (large file, this may take a while)...") + logger.info( + "Importing JGI GOLD biosamples (large file, this may take a while)..." + ) count = importer.import_jgi_gold_biosamples(jgi_biosample_file) total_imported += count except Exception as e: logger.error(f"Failed to import JGI GOLD biosamples: {e}") - + if files_valid[4]: try: - logger.info("Importing JGI GOLD organisms (large file, this may take a while)...") + logger.info( + "Importing JGI GOLD organisms (large file, this may take a while)..." + ) count = importer.import_jgi_gold_organisms(jgi_organism_file) total_imported += count except Exception as e: logger.error(f"Failed to import JGI GOLD organisms: {e}") else: logger.info("Skipping large JGI GOLD files as requested") - + # Close connection importer.close() - + logger.info(f"Import process completed. Total records imported: {total_imported}") return 0 diff --git a/mongodb/legacy/geo_query.py b/mongodb/legacy/geo_query.py index e806721..002848a 100644 --- a/mongodb/legacy/geo_query.py +++ b/mongodb/legacy/geo_query.py @@ -2,7 +2,7 @@ """ Geospatial Query Tool for MongoDB -This script provides utilities for querying geospatial data +This script provides utilities for querying geospatial data imported into MongoDB by the geo_importer.py script. Usage: @@ -21,229 +21,255 @@ # Configure logging logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) -logger = logging.getLogger('geo-query') +logger = logging.getLogger("geo-query") class GeoQuery: """MongoDB geospatial data query utilities.""" - + def __init__(self, connection_string: str = "mongodb://localhost:27017"): """Initialize MongoDB connection. 
- + Args: connection_string: MongoDB connection URI """ self.client = MongoClient(connection_string) self.db = self.client.geospatialDB self.collection = self.db.locations - + def get_stats(self) -> Dict[str, Any]: """Get statistics about the data in the collection. - + Returns: Dictionary with statistics """ logger.info("Retrieving collection statistics") - + total = self.collection.count_documents({}) - + # Count by dataset type - emsl_count = self.collection.count_documents({ - 'dataset_id': {'$regex': '^emsl'} - }) - - ess_dive_count = self.collection.count_documents({ - 'dataset_id': {'$regex': '^ess-dive'} - }) - - nmdc_count = self.collection.count_documents({ - 'dataset_id': {'$regex': '^nmdc:'} - }) - - jgi_count = self.collection.count_documents({ - 'dataset_id': {'$regex': '^jgi:'} - }) + emsl_count = self.collection.count_documents( + {"dataset_id": {"$regex": "^emsl"}} + ) + + ess_dive_count = self.collection.count_documents( + {"dataset_id": {"$regex": "^ess-dive"}} + ) + + nmdc_count = self.collection.count_documents( + {"dataset_id": {"$regex": "^nmdc:"}} + ) + + jgi_count = self.collection.count_documents({"dataset_id": {"$regex": "^jgi:"}}) # Get bounding box - bounds = list(self.collection.aggregate([ - { - '$group': { - '_id': None, - 'minLat': {'$min': {'$arrayElemAt': ['$coordinates.coordinates', 1]}}, - 'maxLat': {'$max': {'$arrayElemAt': ['$coordinates.coordinates', 1]}}, - 'minLng': {'$min': {'$arrayElemAt': ['$coordinates.coordinates', 0]}}, - 'maxLng': {'$max': {'$arrayElemAt': ['$coordinates.coordinates', 0]}} - } - } - ])) - + bounds = list( + self.collection.aggregate( + [ + { + "$group": { + "_id": None, + "minLat": { + "$min": { + "$arrayElemAt": ["$coordinates.coordinates", 1] + } + }, + "maxLat": { + "$max": { + "$arrayElemAt": ["$coordinates.coordinates", 1] + } + }, + "minLng": { + "$min": { + "$arrayElemAt": ["$coordinates.coordinates", 0] + } + }, + "maxLng": { + "$max": { + "$arrayElemAt": ["$coordinates.coordinates", 0] + } + 
}, + } + } + ] + ) + ) + boundary = bounds[0] if bounds else None - + return { - 'total': total, - 'dataset_counts': { - 'proposals': proposal_count, - 'ess_dive': ess_dive_count, - 'nmdc': nmdc_count, - 'nmdc': jgi_count, - 'other': total - (proposal_count + ess_dive_count + nmdc_count) + "total": total, + "dataset_counts": { + "proposals": proposal_count, + "ess_dive": ess_dive_count, + "nmdc": nmdc_count, + "nmdc": jgi_count, + "other": total - (proposal_count + ess_dive_count + nmdc_count), }, - 'bounds': { - 'south': boundary['minLat'], - 'north': boundary['maxLat'], - 'west': boundary['minLng'], - 'east': boundary['maxLng'] - } if boundary else None + "bounds": { + "south": boundary["minLat"], + "north": boundary["maxLat"], + "west": boundary["minLng"], + "east": boundary["maxLng"], + } + if boundary + else None, } - def find_by_system(self, system_name: str, limit: int = 1000) -> List[Dict[str, Any]]: + def find_by_system( + self, system_name: str, limit: int = 1000 + ) -> List[Dict[str, Any]]: """Find all points from a specific system. - + Args: system_name: The system name to search for limit: Maximum number of results to return - + Returns: List of matching documents """ logger.info(f"Searching for system: {system_name}") - - cursor = self.collection.find({'system_name': system_name}).limit(limit) + + cursor = self.collection.find({"system_name": system_name}).limit(limit) return list(cursor) - - + def find_by_dataset(self, dataset_id: str) -> List[Dict[str, Any]]: """Find all points in a specific dataset. 
- + Args: dataset_id: The dataset ID to search for - + Returns: List of matching documents """ logger.info(f"Searching for dataset: {dataset_id}") - - cursor = self.collection.find({'dataset_id': dataset_id}) + + cursor = self.collection.find({"dataset_id": dataset_id}) return list(cursor) - - def find_in_box(self, west: float, south: float, east: float, north: float, - limit: int = 1000) -> List[Dict[str, Any]]: + + def find_in_box( + self, west: float, south: float, east: float, north: float, limit: int = 1000 + ) -> List[Dict[str, Any]]: """Find points within a bounding box. - + Args: west: Western longitude south: Southern latitude east: Eastern longitude north: Northern latitude limit: Maximum number of results to return - + Returns: List of documents within the bounding box """ logger.info(f"Searching within box: W:{west}, S:{south}, E:{east}, N:{north}") - + query = { - 'coordinates': { - '$geoWithin': { - '$geometry': { - 'type': 'Polygon', - 'coordinates': [[ - [west, south], - [east, south], - [east, north], - [west, north], - [west, south] - ]] + "coordinates": { + "$geoWithin": { + "$geometry": { + "type": "Polygon", + "coordinates": [ + [ + [west, south], + [east, south], + [east, north], + [west, north], + [west, south], + ] + ], } } } } - + cursor = self.collection.find(query).limit(limit) return list(cursor) - - def find_nearby(self, lat: float, lng: float, - distance: int = 10000, limit: int = 100) -> List[Dict[str, Any]]: + + def find_nearby( + self, lat: float, lng: float, distance: int = 10000, limit: int = 100 + ) -> List[Dict[str, Any]]: """Find points near a specific location. 
- + Args: lat: Latitude lng: Longitude distance: Maximum distance in meters limit: Maximum number of results to return - + Returns: List of nearby documents """ logger.info(f"Searching near point ({lat}, {lng}) within {distance}m") - + query = { - 'coordinates': { - '$near': { - '$geometry': { - 'type': 'Point', - 'coordinates': [lng, lat] - }, - '$maxDistance': distance + "coordinates": { + "$near": { + "$geometry": {"type": "Point", "coordinates": [lng, lat]}, + "$maxDistance": distance, } } } - + cursor = self.collection.find(query).limit(limit) return list(cursor) - - def create_map(self, points: List[Dict[str, Any]], - output_file: str = 'geo_map.html') -> None: + + def create_map( + self, points: List[Dict[str, Any]], output_file: str = "geo_map.html" + ) -> None: """Create an interactive map visualization of points. - + Args: points: List of documents with coordinates output_file: Path to save the HTML map file """ logger.info(f"Creating map with {len(points)} points") - + if not points: logger.warning("No points to visualize") return - + # Calculate center point - lats = [p['coordinates']['coordinates'][1] for p in points if 'coordinates' in p] - lngs = [p['coordinates']['coordinates'][0] for p in points if 'coordinates' in p] - + lats = [ + p["coordinates"]["coordinates"][1] for p in points if "coordinates" in p + ] + lngs = [ + p["coordinates"]["coordinates"][0] for p in points if "coordinates" in p + ] + if not lats or not lngs: logger.warning("No valid coordinates found") return - + center_lat = sum(lats) / len(lats) center_lng = sum(lngs) / len(lngs) - + # Create map m = folium.Map(location=[center_lat, center_lng], zoom_start=4) - + # Add marker cluster marker_cluster = MarkerCluster().add_to(m) - + # Add markers for point in points: - if 'coordinates' not in point: + if "coordinates" not in point: continue - - coords = point['coordinates']['coordinates'] + + coords = point["coordinates"]["coordinates"] if len(coords) < 2: continue - + # Get point 
details - dataset_id = point.get('dataset_id', 'Unknown') - system_name = point.get('system_name', 'Unknown') - + dataset_id = point.get("dataset_id", "Unknown") + system_name = point.get("system_name", "Unknown") + # Get metadata if available - metadata = point.get('metadata', {}) - description = metadata.get('description', '') - source = metadata.get('source', 'Unknown source') - + metadata = point.get("metadata", {}) + description = metadata.get("description", "") + source = metadata.get("source", "Unknown source") + # Create popup content popup_content = f""" Dataset: {dataset_id}
@@ -251,62 +277,63 @@ def create_map(self, points: List[Dict[str, Any]], Coordinates: {coords[1]}, {coords[0]}
Source: {source}
""" - + if description: popup_content += f"Description: {description}
" - + # Add marker folium.Marker( location=[coords[1], coords[0]], popup=folium.Popup(popup_content, max_width=300), - tooltip=system_name + tooltip=system_name, ).add_to(marker_cluster) - + # Save map m.save(output_file) logger.info(f"Map saved to {output_file}") - - def export_to_csv(self, points: List[Dict[str, Any]], - output_file: str = 'geo_data.csv') -> None: + + def export_to_csv( + self, points: List[Dict[str, Any]], output_file: str = "geo_data.csv" + ) -> None: """Export query results to CSV. - + Args: points: List of documents output_file: Path to save the CSV file """ logger.info(f"Exporting {len(points)} points to CSV") - + if not points: logger.warning("No points to export") return - + # Prepare data for DataFrame rows = [] for point in points: row = { - 'dataset_id': point.get('dataset_id', ''), - 'system_name': point.get('system_name', '') + "dataset_id": point.get("dataset_id", ""), + "system_name": point.get("system_name", ""), } - + # Add coordinates - if 'coordinates' in point and 'coordinates' in point['coordinates']: - coords = point['coordinates']['coordinates'] + if "coordinates" in point and "coordinates" in point["coordinates"]: + coords = point["coordinates"]["coordinates"] if len(coords) >= 2: - row['longitude'] = coords[0] - row['latitude'] = coords[1] - + row["longitude"] = coords[0] + row["latitude"] = coords[1] + # Add metadata fields - metadata = point.get('metadata', {}) + metadata = point.get("metadata", {}) for key, value in metadata.items(): - row[f'metadata_{key}'] = value - + row[f"metadata_{key}"] = value + rows.append(row) - + # Create DataFrame and export df = pd.DataFrame(rows) df.to_csv(output_file, index=False) logger.info(f"Data exported to {output_file}") - + def close(self) -> None: """Close the MongoDB connection.""" self.client.close() @@ -315,151 +342,167 @@ def close(self) -> None: def main(): """Main function to run queries.""" - parser = argparse.ArgumentParser(description='Query geospatial data from MongoDB') 
- parser.add_argument('--mongodb-uri', type=str, default='mongodb://localhost:27017', - help='MongoDB connection string') - parser.add_argument('--action', type=str, required=True, - choices=['stats', 'dataset', 'system', 'box', 'nearby', 'map'], - help='Query action to perform') - + parser = argparse.ArgumentParser(description="Query geospatial data from MongoDB") + parser.add_argument( + "--mongodb-uri", + type=str, + default="mongodb://localhost:27017", + help="MongoDB connection string", + ) + parser.add_argument( + "--action", + type=str, + required=True, + choices=["stats", "dataset", "system", "box", "nearby", "map"], + help="Query action to perform", + ) + # Parameters for different query types - parser.add_argument('--system-name', type=str, - help='System name for system queries') - parser.add_argument('--dataset-id', type=str, - help='Dataset ID for dataset queries') - parser.add_argument('--lat', type=float, - help='Latitude for nearby queries') - parser.add_argument('--lng', type=float, - help='Longitude for nearby queries') - parser.add_argument('--distance', type=int, default=10000, - help='Distance in meters for nearby queries') - parser.add_argument('--west', type=float, - help='Western longitude for box queries') - parser.add_argument('--south', type=float, - help='Southern latitude for box queries') - parser.add_argument('--east', type=float, - help='Eastern longitude for box queries') - parser.add_argument('--north', type=float, - help='Northern latitude for box queries') - parser.add_argument('--limit', type=int, default=100000, - help='Maximum number of results') - parser.add_argument('--output', type=str, default='output', - help='Output file name prefix (without extension)') - parser.add_argument('--format', type=str, choices=['json', 'csv', 'map'], default='json', - help='Output format') - + parser.add_argument( + "--system-name", type=str, help="System name for system queries" + ) + parser.add_argument("--dataset-id", type=str, 
help="Dataset ID for dataset queries") + parser.add_argument("--lat", type=float, help="Latitude for nearby queries") + parser.add_argument("--lng", type=float, help="Longitude for nearby queries") + parser.add_argument( + "--distance", + type=int, + default=10000, + help="Distance in meters for nearby queries", + ) + parser.add_argument("--west", type=float, help="Western longitude for box queries") + parser.add_argument("--south", type=float, help="Southern latitude for box queries") + parser.add_argument("--east", type=float, help="Eastern longitude for box queries") + parser.add_argument("--north", type=float, help="Northern latitude for box queries") + parser.add_argument( + "--limit", type=int, default=100000, help="Maximum number of results" + ) + parser.add_argument( + "--output", + type=str, + default="output", + help="Output file name prefix (without extension)", + ) + parser.add_argument( + "--format", + type=str, + choices=["json", "csv", "map"], + default="json", + help="Output format", + ) + args = parser.parse_args() - + # Initialize query object query = GeoQuery(args.mongodb_uri) - + try: # Perform the requested action - if args.action == 'stats': + if args.action == "stats": # Get collection statistics stats = query.get_stats() print(json.dumps(stats, indent=2)) - + # Save to file if requested - if args.format == 'json': - with open(f"{args.output}.json", 'w') as f: + if args.format == "json": + with open(f"{args.output}.json", "w") as f: json.dump(stats, f, indent=2) logger.info(f"Statistics saved to {args.output}.json") - - elif args.action == 'dataset': + + elif args.action == "dataset": # Validate parameters if not args.dataset_id: logger.error("Missing dataset-id parameter") return 1 - + # Query by dataset ID results = query.find_by_dataset(args.dataset_id) logger.info(f"Found {len(results)} records for dataset {args.dataset_id}") - + # Output results - if args.format == 'json': - with open(f"{args.output}.json", 'w') as f: + if args.format == 
"json": + with open(f"{args.output}.json", "w") as f: json.dump(results, f, indent=2, default=str) logger.info(f"Results saved to {args.output}.json") - elif args.format == 'csv': + elif args.format == "csv": query.export_to_csv(results, f"{args.output}.csv") - elif args.format == 'map': + elif args.format == "map": query.create_map(results, f"{args.output}.html") - elif args.action == 'system': + elif args.action == "system": # Validate parameters if not args.system_name: logger.error("Missing system-name parameter") return 1 - + # Query by system name results = query.find_by_system(args.system_name, args.limit) logger.info(f"Found {len(results)} records for system {args.system_name}") - + # Output results - if args.format == 'json': - with open(f"{args.output}.json", 'w') as f: + if args.format == "json": + with open(f"{args.output}.json", "w") as f: json.dump(results, f, indent=2, default=str) logger.info(f"Results saved to {args.output}.json") - elif args.format == 'csv': + elif args.format == "csv": query.export_to_csv(results, f"{args.output}.csv") - elif args.format == 'map': - query.create_map(results, f"{args.output}.html") - - elif args.action == 'box': + elif args.format == "map": + query.create_map(results, f"{args.output}.html") + + elif args.action == "box": # Validate parameters if None in [args.west, args.south, args.east, args.north]: - logger.error("Missing bounding box parameters (west, south, east, north)") + logger.error( + "Missing bounding box parameters (west, south, east, north)" + ) return 1 - + # Query within bounding box results = query.find_in_box( args.west, args.south, args.east, args.north, args.limit ) logger.info(f"Found {len(results)} records in bounding box") - + # Output results - if args.format == 'json': - with open(f"{args.output}.json", 'w') as f: + if args.format == "json": + with open(f"{args.output}.json", "w") as f: json.dump(results, f, indent=2, default=str) logger.info(f"Results saved to {args.output}.json") - elif 
args.format == 'csv': + elif args.format == "csv": query.export_to_csv(results, f"{args.output}.csv") - elif args.format == 'map': + elif args.format == "map": query.create_map(results, f"{args.output}.html") - - elif args.action == 'nearby': + + elif args.action == "nearby": # Validate parameters if None in [args.lat, args.lng]: logger.error("Missing location parameters (lat, lng)") return 1 - + # Query nearby points - results = query.find_nearby( - args.lat, args.lng, args.distance, args.limit - ) + results = query.find_nearby(args.lat, args.lng, args.distance, args.limit) logger.info(f"Found {len(results)} records near ({args.lat}, {args.lng})") - + # Output results - if args.format == 'json': - with open(f"{args.output}.json", 'w') as f: + if args.format == "json": + with open(f"{args.output}.json", "w") as f: json.dump(results, f, indent=2, default=str) logger.info(f"Results saved to {args.output}.json") - elif args.format == 'csv': + elif args.format == "csv": query.export_to_csv(results, f"{args.output}.csv") - elif args.format == 'map': + elif args.format == "map": query.create_map(results, f"{args.output}.html") - - elif args.action == 'map': + + elif args.action == "map": # Create a map with all points (limited by --limit) results = list(query.collection.find().limit(args.limit)) logger.info(f"Found {len(results)} records for map") query.create_map(results, f"{args.output}.html") - + finally: # Close connection query.close() - + return 0 diff --git a/pyproject.toml b/pyproject.toml index 98b503c..da4616e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,25 +24,31 @@ dependencies = [ "bertron-schema @ git+https://github.com/ber-data/bertron-schema.git", # "dtspy @ https://github.com/kbase/dtspy/archive/730828cff3924fc4b2215fe5c1b67bc04aad377f.tar.gz", "fastapi[standard]>=0.115.12", + # `httpx` is a dependency of FastAPI's `TestClient` class, which we use + # in the server test suite. 
It is also a dependency of `mongodb/ingest_data.py`, + # which is why we currently list it as a non-dev dependency. + "httpx>=0.28.1", "jsonschema>=4.0.0", "nmdc-api-utilities>=0.3.9", "pydantic-settings>=2.10.1", "pymongo>=4.13.1", - "pytest>=8.4.0", "uvicorn>=0.34.3", ] [dependency-groups] dev = [ - # `httpx` is a dependency of FastAPI's `TestClient` class. - # Docs: https://fastapi.tiangolo.com/tutorial/testing/#using-testclient - "httpx>=0.28.1", "pre-commit>=4.1.0", "pyright>=1.1.386", - "pytest>=8.3.5", + "pytest>=8.4.1", "ruff>=0.9.9", ] [tool.pyright] venvPath = "." venv = ".venv" + +# Configure pytest. +# Docs: https://docs.pytest.org/en/stable/reference/customize.html#pyproject-toml +[tool.pytest.ini_options] +# Configure pytest to run doctests, and to ignore directories that contain currently-broken modules. +addopts = "--doctest-modules --ignore='src/bertron/' --ignore='mongodb/legacy/'" diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/bertron_client.py b/src/bertron_client.py index 07fc788..5da78af 100644 --- a/src/bertron_client.py +++ b/src/bertron_client.py @@ -6,7 +6,7 @@ Provides methods to query and retrieve entity data from the BER data sources. """ -import requests +import requests # FIXME: `requests` is not listed as a dependency in `pyproject.toml` from typing import List, Dict, Any, Optional from dataclasses import dataclass import logging diff --git a/src/models.py b/src/models.py index 4770658..e1c77b3 100644 --- a/src/models.py +++ b/src/models.py @@ -1,5 +1,71 @@ +from typing import Any, Dict, Optional, List + from pydantic import BaseModel, ConfigDict, Field -from typing import Optional + +from schema.datamodel.bertron_schema_pydantic import Entity + + +class MongoFindQueryDescriptor(BaseModel): + r""" + A model representing a MongoDB find query, including the filter, the projection, + and some additional options. 
+ + Reference: https://www.mongodb.com/docs/manual/reference/method/db.collection.find/ + """ + + filter: Dict[str, Any] = Field( + default={}, + description="MongoDB find query filter", + ) + projection: Optional[Dict[str, Any]] = Field( + default=None, + description="Fields to include or exclude", + ) + skip: Optional[int] = Field( + default=0, + ge=0, + description="Number of documents to skip", + ) + limit: Optional[int] = Field( + default=100, + ge=1, + le=1000, # TODO: Was this chosen arbitrarily? + description="Maximum number of documents to return", + ) + sort: Optional[Dict[str, int]] = Field( + default=None, + description="Sort criteria (1 for ascending, -1 for descending)", + ) + + +class EntitiesResponse(BaseModel): + r"""A response containing a list of entities and count.""" + + documents: List[Entity] = Field( + ..., + title="Entity documents", + description="List of entities returned by the query", + ) + count: int = Field( + ..., + title="Entity count", + description="Total number of entities returned", + ) + + +class FindResponse(BaseModel): + r"""A response containing a list of dicts and count.""" + + documents: List = Field( + ..., + title="Documents", + description="List of Documents returned by the query", + ) + count: int = Field( + ..., + title="Document count", + description="Total number of documents returned", + ) class HealthResponse(BaseModel): diff --git a/src/server.py b/src/server.py index acc84b5..59dbe37 100644 --- a/src/server.py +++ b/src/server.py @@ -1,16 +1,22 @@ import logging -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, Union from fastapi import FastAPI, HTTPException, Query from fastapi.responses import RedirectResponse from pymongo import MongoClient -from pydantic import BaseModel, Field -from schema.datamodel import bertron_schema_pydantic +from schema.datamodel.bertron_schema_pydantic import Entity import uvicorn -from lib.helpers import get_package_version -from models import 
HealthResponse, VersionResponse from config import settings as cfg +from lib.helpers import get_package_version +from src.models import ( + EntitiesResponse, + FindResponse, + HealthResponse, + MongoFindQueryDescriptor, + VersionResponse, +) + # Set up logging logger = logging.getLogger(__name__) @@ -60,7 +66,7 @@ def get_version() -> VersionResponse: @app.get("/bertron") -def get_all_entities(): +def get_all_entities() -> EntitiesResponse: r"""Get all documents from the entities collection.""" db = mongo_client[cfg.mongo_database] @@ -74,31 +80,20 @@ def get_all_entities(): # Convert documents to Entity objects entities = [] for doc in documents: - entities.append(convert_document_to_entity(doc)) - - return {"documents": entities, "count": len(entities)} - + entities.append(Entity(**clean_document(doc))) -class MongoDBQuery(BaseModel): - filter: Dict[str, Any] = Field(default={}, description="MongoDB find query filter") - projection: Optional[Dict[str, Any]] = Field( - default=None, description="Fields to include or exclude" - ) - skip: Optional[int] = Field( - default=0, ge=0, description="Number of documents to skip" - ) - limit: Optional[int] = Field( - default=100, ge=1, le=1000, description="Maximum number of documents to return" - ) - sort: Optional[Dict[str, int]] = Field( - default=None, description="Sort criteria (1 for ascending, -1 for descending)" - ) + return EntitiesResponse(documents=entities, count=len(entities)) @app.post("/bertron/find") -def find_entities(query: MongoDBQuery): +def find_entities( + query: MongoFindQueryDescriptor, +) -> Union[EntitiesResponse, FindResponse]: r"""Execute a MongoDB find operation on the entities collection with filter, projection, skip, limit, and sort options. + Returns EntitiesResponse (validated Entity objects) when no projection is specified, + or FindResponse (raw documents) when projection is used. 
+ Example query body: { "filter": {"field": "value", "number_field": {"$gt": 100}}, @@ -128,13 +123,27 @@ def find_entities(query: MongoDBQuery): if query.limit: cursor = cursor.limit(query.limit) - # Convert cursor to list and convert to Entity objects + # Convert cursor to list documents = list(cursor) - entities = [] - for doc in documents: - entities.append(convert_document_to_entity(doc)) - return {"documents": entities, "count": len(entities)} + # Return different response types based on whether projection is used + if query.projection: + # When projection is used, return raw documents as FindResponse + # Remove MongoDB internal fields + cleaned_documents = [] + for doc in documents: + cleaned_documents.append(clean_document(doc)) + + return FindResponse( + documents=cleaned_documents, count=len(cleaned_documents) + ) + else: + # When no projection, return validated Entity objects as EntitiesResponse + entities = [] + for doc in documents: + entities.append(Entity(**clean_document(doc))) + + return EntitiesResponse(documents=entities, count=len(entities)) except Exception as e: raise HTTPException(status_code=400, detail=f"Query error: {str(e)}") @@ -149,7 +158,7 @@ def find_nearby_entities( ..., ge=-180, le=180, description="Center longitude in degrees" ), radius_meters: float = Query(..., gt=0, description="Search radius in meters"), -): +) -> EntitiesResponse: r"""Find entities within a specified radius of a geographic point using MongoDB's $near operator. 
This endpoint uses MongoDB's geospatial $near query which requires a 2dsphere index @@ -189,9 +198,9 @@ def find_nearby_entities( documents = list(cursor) entities = [] for doc in documents: - entities.append(convert_document_to_entity(doc)) + entities.append(Entity(**clean_document(doc))) - return {"documents": entities, "count": len(entities)} + return EntitiesResponse(documents=entities, count=len(entities)) except Exception as e: raise HTTPException(status_code=400, detail=f"Nearby query error: {str(e)}") @@ -211,7 +220,7 @@ def find_entities_in_bounding_box( northeast_lng: float = Query( ..., ge=-180, le=180, description="Northeast corner longitude" ), -): +) -> EntitiesResponse: r"""Find entities within a bounding box using MongoDB's $geoWithin operator. This endpoint finds all entities whose coordinates fall within the specified @@ -262,9 +271,9 @@ def find_entities_in_bounding_box( documents = list(cursor) entities = [] for doc in documents: - entities.append(convert_document_to_entity(doc)) + entities.append(Entity(**clean_document(doc))) - return {"documents": entities, "count": len(entities)} + return EntitiesResponse(documents=entities, count=len(entities)) except Exception as e: raise HTTPException( @@ -272,8 +281,8 @@ def find_entities_in_bounding_box( ) -@app.get("/bertron/{id}", response_model=bertron_schema_pydantic.Entity) -def get_entity_by_id(id: str): +@app.get("/bertron/{id:path}") +def get_entity_by_id(id: str) -> Optional[Entity]: r"""Get a single entity by its ID. 
Example: /bertron/emsl:12345 @@ -297,7 +306,7 @@ def get_entity_by_id(id: str): # Validate and create Entity instance try: - entity = convert_document_to_entity(document) + entity = Entity(**clean_document(document)) return entity except Exception as validation_error: logger.error(f"Entity validation failed for id '{id}': {validation_error}") @@ -313,16 +322,30 @@ def get_entity_by_id(id: str): raise HTTPException(status_code=400, detail=f"Query error: {str(e)}") -def convert_document_to_entity( +def clean_document( document: Dict[str, Any], -) -> Optional[bertron_schema_pydantic.Entity]: - """Convert a MongoDB document to an Entity object.""" - # Remove MongoDB _id, metadata, geojson - document.pop("_id", None) - document.pop("_metadata", None) - document.pop("geojson", None) - - return bertron_schema_pydantic.Entity(**document) +) -> Dict[str, Any]: + """ + Removes fields from the MongoDB document, that don't exist on the `Entity` model. + + This function was designed to remove the `_id`, `_metadata`, and `geojson` fields + from the document. + + >>> clean_document({"_id": "123", "_metadata": {}, "geojson": {}, "name": "Test"}) + {'name': 'Test'} + >>> clean_document({}) + {} + """ + + # Determine the names of the fields that the Entity model has. + model_field_names = Entity.model_fields.keys() + + # Remove all _other_ fields from the document. + for key in list(document.keys()): + if key not in model_field_names: + document.pop(key) + + return document if __name__ == "__main__": diff --git a/src/tests/conftest.py b/src/tests/conftest.py deleted file mode 100644 index 25d5e9a..0000000 --- a/src/tests/conftest.py +++ /dev/null @@ -1,35 +0,0 @@ -r""" -This module contains `pytest` fixture definitions that `pytest` will automatically make available -to all tests within this directory and its descendant directories. - -From the `pytest` documentation: -> The `conftest.py` file serves as a means of providing fixtures for an entire directory. 
-> Fixtures defined in a `conftest.py` can be used by any test in that package without -> needing to import them (`pytest` will automatically discover them). -Source: https://docs.pytest.org/en/stable/reference/fixtures.html#conftest-py-sharing-fixtures-across-multiple-files -""" - -import pytest - -from config import settings as cfg - - -# Note: We use `autouse=True` so that this fixture is automatically applied to each test -# within its scope (since we are in a `conftest.py` file, its scope consists of -# the current directory and all descendant directories). -@pytest.fixture(autouse=True) -def patched_cfg(): - r""" - A `pytest` fixture that temporarily patches the application configuration - so it references a test database. - """ - - test_database_name = "bertron_test" - main_database_name = cfg.mongo_database - assert main_database_name != test_database_name, ( - "The main database name matches the test database name. " - "Reconfigure your environment to ensure they differ." - ) - cfg.mongo_database = test_database_name - yield cfg - cfg.mongo_database = main_database_name diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..37e7582 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,51 @@ +r""" +This module contains `pytest` fixture definitions that `pytest` will automatically make available +to all tests within this directory and its descendant directories. + +From the `pytest` documentation: +> The `conftest.py` file serves as a means of providing fixtures for an entire directory. +> Fixtures defined in a `conftest.py` can be used by any test in that package without +> needing to import them (`pytest` will automatically discover them). 
+Source: https://docs.pytest.org/en/stable/reference/fixtures.html#conftest-py-sharing-fixtures-across-multiple-files +""" + +import pytest + +from src.config import settings + + +# Note: We use `autouse=True` so that this fixture is automatically applied to each test +# within its scope (since we are in a `conftest.py` file, its scope consists of +# the current directory and all descendant directories). +@pytest.fixture(autouse=True) +def patched_config(monkeypatch): + r""" + A `pytest` fixture that temporarily patches the application configuration + so it references a test database. + + From the pytest documentation: + > `monkeypatch.setattr` works by (temporarily) changing the object that a name points to + > with another one. There can be many names pointing to any individual object, so for + > patching to work you must ensure that you patch the name used by the system under test. + Source: https://docs.pytest.org/en/stable/reference/reference.html#pytest.MonkeyPatch.setattr + + Also from the pytest documentation: + > All modifications will be undone after the requesting test function or fixture has finished. + """ + + # First, we do a safety check to ensure that the test database is distinct from the main one. + main_database_name = settings.mongo_database + test_database_name = "bertron_test" + assert main_database_name != test_database_name, ( + "The main database name matches the test database name. " + "Reconfigure your environment to ensure they differ." + ) + + # Then, we patch the config object so it references the test database. + # Note: Different modules import the config object using different `import` paths. + monkeypatch.setattr("config.settings.mongo_database", test_database_name) + monkeypatch.setattr("src.config.settings.mongo_database", test_database_name) + + # Finally, we yield control to the test that depends on this fixture. + # Note: After the test completes, `monkeypatch` will automatically un-patch things. 
+ yield diff --git a/tests/data/emsl-example.json b/tests/data/emsl-example.json new file mode 100644 index 0000000..d6bd0ab --- /dev/null +++ b/tests/data/emsl-example.json @@ -0,0 +1,20 @@ +{ + "ber_data_source": "EMSL", + "coordinates": { + "latitude": 34, + "longitude": 118.0, + "altitude": null, + "depth": null, + "elevation": null + }, + "entity_type": [ + "sample" + ], + "description": "Clostridium thermocellum protein extracts", + "id": "EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488", + "name": "EMSL Sample c9405190-e962-4ba5-93f0-e3ff499f4488", + "alt_ids": null, + "alt_names": null, + "part_of_collection": null, + "uri": "https://sc-data.emsl.pnnl.gov/?projectId=61815" +} diff --git a/tests/data/ess-dive-example.json b/tests/data/ess-dive-example.json new file mode 100644 index 0000000..9e80b3f --- /dev/null +++ b/tests/data/ess-dive-example.json @@ -0,0 +1,68 @@ +[ + { + "ber_data_source": "ESS-DIVE", + "coordinates": { + "latitude": 65.162309, + "longitude": -164.819851, + "altitude": null, + "depth": null, + "elevation": null + }, + "entity_type": [ + "unspecified" + ], + "description": "Maps of land surface phenology derived from PlanetScope data, 2018-2022, Teller, Kougarok, and Council, Seward Peninsula", + "id": "doi:10.15485/2441497", + "name": "NGEE Arctic Kougarok Site, Mile Marker 64, Alaska", + "alt_ids": [ + "NGA547" + ], + "alt_names": null, + "part_of_collection": [], + "uri": "https://data.ess-dive.lbl.gov/view/doi:10.15485/2441497" + }, + { + "ber_data_source": "ESS-DIVE", + "coordinates": { + "latitude": 64.735492, + "longitude": -165.95039, + "altitude": null, + "depth": null, + "elevation": null + }, + "entity_type": [ + "unspecified" + ], + "description": "Maps of land surface phenology derived from PlanetScope data, 2018-2022, Teller, Kougarok, and Council, Seward Peninsula", + "id": "doi:10.15485/2441497", + "name": "NGEE Arctic Teller Site, Mile Marker 27, Alaska", + "alt_ids": [ + "NGA547" + ], + "alt_names": null, + 
"part_of_collection": [], + "uri": "https://data.ess-dive.lbl.gov/view/doi:10.15485/2441497" + }, + { + "ber_data_source": "ESS-DIVE", + "coordinates": { + "latitude": 64.847286, + "longitude": -163.71993600000002, + "altitude": null, + "depth": null, + "elevation": null + }, + "entity_type": [ + "unspecified" + ], + "description": "Maps of land surface phenology derived from PlanetScope data, 2018-2022, Teller, Kougarok, and Council, Seward Peninsula", + "id": "doi:10.15485/2441497", + "name": "NGEE Arctic Council Site, Mile Marker 71, Alaska", + "alt_ids": [ + "NGA547" + ], + "alt_names": null, + "part_of_collection": [], + "uri": "https://data.ess-dive.lbl.gov/view/doi:10.15485/2441497" + } +] \ No newline at end of file diff --git a/tests/data/gold-example.json b/tests/data/gold-example.json new file mode 100644 index 0000000..3abe969 --- /dev/null +++ b/tests/data/gold-example.json @@ -0,0 +1,34 @@ +{ + "ber_data_source": "JGI", + "coordinates": { + "latitude": 44.7523206, + "longitude": -110.7253926, + "altitude": null, + "depth": null, + "elevation": { + "has_numeric_value": 2280, + "has_unit": "meter (UO:0000008)" + } + }, + "entity_type": [ + "jgi_biosample" + ], + "description": "Small acidic pool on hillside north of Nymph Lake.", + "id": "Gb0051341", + "name": "Hot spring microbial communities from Yellowstone National Park, Wyoming, USA - YNP2 Nymph Lake 10", + "alt_ids": [ + "NCBITaxon:433727" + ], + "alt_names": [ + { + "name": "GOLD biosample ID Gb0051341", + "name_type": "exact_synonym" + }, + { + "name": "hot springs metagenome", + "name_type": "broad_synonym" + } + ], + "part_of_collection": [], + "uri": "https://gold.jgi.doe.gov/biosample?id=Gb0051341" +} diff --git a/tests/data/monet-example.json b/tests/data/monet-example.json new file mode 100644 index 0000000..1e9044c --- /dev/null +++ b/tests/data/monet-example.json @@ -0,0 +1,23 @@ +{ + "ber_data_source": "MONET", + "coordinates": { + "latitude": 68.633578, + "longitude": -149.632826, + 
"altitude": null, + "depth": null, + "elevation": { + "has_numeric_value": 722.613, + "has_unit": "unknown" + } + }, + "entity_type": [ + "sample" + ], + "description": null, + "id": "MONET:072e85bf-4a43-4212-83dc-108bb262620c", + "name": "MONet Core 60920_7", + "alt_ids": null, + "alt_names": null, + "part_of_collection": null, + "uri": "https://sc-data.emsl.pnnl.gov/monet" +} \ No newline at end of file diff --git a/tests/data/nmdc-example.json b/tests/data/nmdc-example.json new file mode 100644 index 0000000..8a9e766 --- /dev/null +++ b/tests/data/nmdc-example.json @@ -0,0 +1,28 @@ +{ + "ber_data_source": "NMDC", + "coordinates": { + "latitude": 28.125842, + "longitude": -81.434174, + "altitude": null, + "depth": { + "has_minimum_numeric_value": 0, + "has_maximum_numeric_value": 0.1, + "has_unit": "m", + "has_raw_value": "0 - 0.1m" + }, + "elevation": { + "has_numeric_value": 24, + "has_unit": "m" + } + }, + "entity_type": [ + "sample" + ], + "description": "MONet sample represented in NMDC", + "id": "nmdc:bsm-11-bsf8yq62", + "name": "DSNY_CoreB_TOP", + "alt_ids": null, + "alt_names": null, + "part_of_collection": null, + "uri": "https://api.microbiomedata.org/biosamples/nmdc%3Absm-11-bsf8yq62" +} \ No newline at end of file diff --git a/tests/test_api.py b/tests/test_api.py new file mode 100644 index 0000000..04c52fd --- /dev/null +++ b/tests/test_api.py @@ -0,0 +1,454 @@ +import sys +from typing import Dict, Any +from unittest.mock import patch + +from fastapi.testclient import TestClient +from pymongo import MongoClient +from pymongo.database import Database +import pytest +from starlette import status + +from src.config import settings as cfg +from src.server import app +from mongodb.ingest_data import main as ingest_main + + +@pytest.fixture +def test_client(): + test_client = TestClient(app) + yield test_client + + +@pytest.fixture +def seeded_db(): + r"""Yields a database seeded using (effectively) the `ingest` script.""" + + # Get a reference to the test 
database. + mongo_client = MongoClient( + host=cfg.mongo_host, + port=cfg.mongo_port, + username=cfg.mongo_username, + password=cfg.mongo_password, + ) + db = mongo_client[cfg.mongo_database] + + # Drop the test database. + mongo_client.drop_database(cfg.mongo_database) + + # Invoke the standard `ingest` script to populate the test database. + # + # Note: We patch `sys.argv` so that the script can run as if it + # were invoked from the command line. + # + # TODO: Update the ingest script so its core functionality + # can be invoked directly (e.g. as a function) without + # needing to patch `sys.argv`. + # + ingest_cli_args = [ + "ingest_data.py", + "--mongo-uri", + f"mongodb://{cfg.mongo_username}:{cfg.mongo_password}@{cfg.mongo_host}:{cfg.mongo_port}", + "--db-name", + cfg.mongo_database, + "--input", + "tests/data", + "--clean", + ] + with patch.object(sys, "argv", ingest_cli_args): + ingest_main() + assert len(db.list_collection_names()) > 0 + + # Yield a reference to the now-seeded test database. + yield db + + # Drop the test database. + mongo_client.drop_database(cfg.mongo_database) + + # Close the Mongo connection. + mongo_client.close() + + +class TestBertronAPI: + r""" + Test suite for BERtron API endpoints assuming data is loaded. + + TODO: Remove prerequisite of data having been loaded by the `ingest` script. + Instead, implement a sufficient fixture within the test suite. 
+ """ + + def test_get_all_entities(self, test_client: TestClient, seeded_db: Database): + """Test getting all entities from the collection.""" + response = test_client.get("/bertron") + + assert response.status_code == status.HTTP_200_OK + entities_data = response.json() + + # Verify response structure matches EntitiesResponse + assert "documents" in entities_data + assert "count" in entities_data + + # Verify data types + assert isinstance(entities_data["documents"], list) + assert isinstance(entities_data["count"], int) + + # Count should match the length of documents + assert entities_data["count"] == len(entities_data["documents"]) + + # If we have entities, verify structure of first entity + if entities_data["count"] > 0: + entity = entities_data["documents"][0] + self._verify_entity_structure(entity) + + def test_get_entity_by_id_emsl(self, test_client: TestClient, seeded_db: Database): + """Test getting a specific EMSL entity by ID.""" + entity_id = "EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488" + response = test_client.get(f"/bertron/{entity_id}") + + assert response.status_code == status.HTTP_200_OK + entity = response.json() + + # Verify this is the correct entity + assert entity["id"] == entity_id + assert entity["ber_data_source"] == "EMSL" + assert entity["name"] == "EMSL Sample c9405190-e962-4ba5-93f0-e3ff499f4488" + assert entity["description"] == "Clostridium thermocellum protein extracts" + + # Verify coordinates + assert entity["coordinates"]["latitude"] == 34 + assert entity["coordinates"]["longitude"] == 118.0 + + self._verify_entity_structure(entity) + + # TODO: Consider using URL encoding (a.k.a. "percent-encoding") for the slashes. 
+ def test_get_entity_by_id_ess_dive( + self, test_client: TestClient, seeded_db: Database + ): + """Test getting a specific ESS-DIVE entity by ID.""" + entity_id = "doi:10.15485/2441497" + response = test_client.get(f"/bertron/{entity_id}") + + assert response.status_code == status.HTTP_200_OK + entity = response.json() + + # Verify this is the correct entity + assert entity["id"] == entity_id + assert entity["ber_data_source"] == "ESS-DIVE" + assert "NGEE Arctic" in entity["name"] + + self._verify_entity_structure(entity) + + def test_get_entity_by_id_nmdc(self, test_client: TestClient, seeded_db: Database): + """Test getting a specific NMDC entity by ID.""" + entity_id = "nmdc:bsm-11-bsf8yq62" + response = test_client.get(f"/bertron/{entity_id}") + + assert response.status_code == status.HTTP_200_OK + entity = response.json() + + # Verify this is the correct entity + assert entity["id"] == entity_id + assert entity["ber_data_source"] == "NMDC" + assert entity["name"] == "DSNY_CoreB_TOP" + assert entity["description"] == "MONet sample represented in NMDC" + + # Verify coordinates with depth and elevation + assert entity["coordinates"]["latitude"] == 28.125842 + assert entity["coordinates"]["longitude"] == -81.434174 + assert entity["coordinates"]["depth"] is not None + assert entity["coordinates"]["elevation"] is not None + + self._verify_entity_structure(entity) + + def test_get_entity_by_id_not_found( + self, test_client: TestClient, seeded_db: Database + ): + """Test getting a non-existent entity returns 404.""" + entity_id = "nonexistent:12345" + response = test_client.get(f"/bertron/{entity_id}") + + assert response.status_code == status.HTTP_404_NOT_FOUND + error_data = response.json() + assert "not found" in error_data["detail"].lower() + + def test_find_entities_with_filter( + self, test_client: TestClient, seeded_db: Database + ): + """Test finding entities with MongoDB filter.""" + query = {"filter": {"ber_data_source": "EMSL"}, "limit": 10} + + 
response = test_client.post( + "/bertron/find", json=query, headers={"Content-Type": "application/json"} + ) + + assert response.status_code == 200 + entities_data = response.json() + + assert "documents" in entities_data + assert "count" in entities_data + assert isinstance(entities_data["documents"], list) + assert isinstance(entities_data["count"], int) + + # All returned entities should be from EMSL + for entity in entities_data["documents"]: + assert entity["ber_data_source"] == "EMSL" + self._verify_entity_structure(entity) + + def test_find_entities_with_projection( + self, test_client: TestClient, seeded_db: Database + ): + """Test finding entities with field projection.""" + query = { + "filter": {}, + "projection": {"id": 1, "name": 1, "ber_data_source": 1}, + "limit": 5, + } + + response = test_client.post( + "/bertron/find", json=query, headers={"Content-Type": "application/json"} + ) + + assert response.status_code == status.HTTP_200_OK + entities_data = response.json() + + assert entities_data["count"] <= 5 + + # Verify projected fields are present + for entity in entities_data["documents"]: + assert "id" in entity + assert "name" in entity + assert "ber_data_source" in entity + + def test_find_entities_with_sort_and_limit( + self, test_client: TestClient, seeded_db: Database + ): + """Test finding entities with sorting and limiting.""" + query = {"filter": {}, "sort": {"ber_data_source": 1, "id": 1}, "limit": 3} + + response = test_client.post( + "/bertron/find", json=query, headers={"Content-Type": "application/json"} + ) + + assert response.status_code == status.HTTP_200_OK + entities_data = response.json() + + assert entities_data["count"] <= 3 + assert len(entities_data["documents"]) <= 3 + + # Verify sorting (should be sorted by ber_data_source, then id) + if len(entities_data["documents"]) > 1: + for i in range(len(entities_data["documents"]) - 1): + current = entities_data["documents"][i] + next_entity = entities_data["documents"][i + 1] + 
assert current["ber_data_source"] <= next_entity["ber_data_source"] + + def test_find_entities_invalid_query( + self, test_client: TestClient, seeded_db: Database + ): + """Test finding entities with invalid MongoDB query.""" + query = {"filter": {"$invalid": "operator"}} + + response = test_client.post( + "/bertron/find", json=query, headers={"Content-Type": "application/json"} + ) + + assert response.status_code == status.HTTP_400_BAD_REQUEST + error_data = response.json() + assert "Query error" in error_data["detail"] + + def test_geo_nearby_search(self, test_client: TestClient, seeded_db: Database): + """Test geographic nearby search.""" + # Search near the EMSL coordinates (34, 118.0) + params = { + "latitude": 34.0, + "longitude": 118.0, + "radius_meters": 100000, # 100km radius + } + + response = test_client.get("/bertron/geo/nearby", params=params) + + assert response.status_code == status.HTTP_200_OK + entities_data = response.json() + + assert "documents" in entities_data + assert "count" in entities_data + + # Should find at least the EMSL entity + found_emsl = False + for entity in entities_data["documents"]: + if entity["id"] == "EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488": + found_emsl = True + self._verify_entity_structure(entity) + + assert found_emsl, "Should find the EMSL entity in nearby search" + + def test_geo_nearby_search_invalid_params( + self, test_client: TestClient, seeded_db: Database + ): + """Test geographic nearby search with invalid parameters.""" + params = { + "latitude": 91.0, # Invalid latitude + "longitude": 118.0, + "radius_meters": 1000, + } + + response = test_client.get("/bertron/geo/nearby", params=params) + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + def test_geo_bounding_box_search( + self, test_client: TestClient, seeded_db: Database + ): + """Test geographic bounding box search.""" + # Bounding box around Alaska (ESS-DIVE data) + params = { + "southwest_lat": 64.0, + "southwest_lng": -166.0, + 
"northeast_lat": 66.0, + "northeast_lng": -163.0, + } + + response = test_client.get("/bertron/geo/bbox", params=params) + + assert response.status_code == status.HTTP_200_OK + entities_data = response.json() + + assert "documents" in entities_data + assert "count" in entities_data + + # Should find ESS-DIVE entities in Alaska + found_ess_dive = False + for entity in entities_data["documents"]: + if entity["ber_data_source"] == "ESS-DIVE": + found_ess_dive = True + # Verify coordinates are within bounding box + lat = entity["coordinates"]["latitude"] + lng = entity["coordinates"]["longitude"] + assert 64.0 <= lat <= 66.0 + assert -166.0 <= lng <= -163.0 + self._verify_entity_structure(entity) + + assert found_ess_dive, "Should find ESS-DIVE entities in Alaska bounding box" + + def test_geo_bounding_box_invalid_coordinates( + self, test_client: TestClient, seeded_db: Database + ): + """Test bounding box search with invalid coordinates.""" + params = { + "southwest_lat": 66.0, # Southwest lat > northeast lat + "southwest_lng": -163.0, + "northeast_lat": 64.0, + "northeast_lng": -166.0, + } + + response = test_client.get("/bertron/geo/bbox", params=params) + assert response.status_code == status.HTTP_400_BAD_REQUEST + error_data = response.json() + assert "latitude" in error_data["detail"].lower() + + def _verify_entity_structure(self, entity: Dict[str, Any]): + """Helper method to verify entity structure matches schema.""" + required_fields = [ + "id", + "name", + "description", + "ber_data_source", + "entity_type", + "coordinates", + ] + + for field in required_fields: + assert field in entity, f"Missing required field: {field}" + + # Verify coordinates structure + coords = entity["coordinates"] + assert "latitude" in coords + assert "longitude" in coords + assert isinstance(coords["latitude"], (int, float)) + assert isinstance(coords["longitude"], (int, float)) + + # Verify entity_type is a list + assert isinstance(entity["entity_type"], list) + assert 
len(entity["entity_type"]) > 0 + + # Verify ber_data_source is valid + valid_sources = ["EMSL", "ESS-DIVE", "NMDC", "JGI"] + assert entity["ber_data_source"] in valid_sources + + +# Integration test that combines multiple operations +class TestBertronAPIIntegration: + """Integration tests that combine multiple API operations.""" + + # No need for live server since we're using TestClient + # Uncomment the line below if you want to run against a test server + # base_url = "http://app:8000" + + def test_data_consistency_across_endpoints( + self, test_client: TestClient, seeded_db: Database + ): + """Test that the same entity returns consistent data across different endpoints.""" + entity_id = "EMSL:c9405190-e962-4ba5-93f0-e3ff499f4488" + + # Get entity by ID + response1 = test_client.get(f"/bertron/{entity_id}") + assert response1.status_code == status.HTTP_200_OK + entity_by_id = response1.json() + + # Find entity using filter + query = {"filter": {"id": entity_id}} + response2 = test_client.post( + "/bertron/find", json=query, headers={"Content-Type": "application/json"} + ) + assert response2.status_code == status.HTTP_200_OK + entities_data = response2.json() + assert entities_data["count"] == 1 + entity_by_filter = entities_data["documents"][0] + + # Both should return the same entity data + assert entity_by_id["id"] == entity_by_filter["id"] + assert entity_by_id["name"] == entity_by_filter["name"] + assert entity_by_id["ber_data_source"] == entity_by_filter["ber_data_source"] + assert entity_by_id["coordinates"] == entity_by_filter["coordinates"] + + def test_geographic_search_consistency( + self, test_client: TestClient, seeded_db: Database + ): + """Test that geographic searches return consistent results.""" + # Get all entities first + response = test_client.get("/bertron") + assert response.status_code == status.HTTP_200_OK + all_entities = response.json()["documents"] + + if len(all_entities) == 0: + pytest.skip("No entities in database for geographic 
consistency test") + + # Pick an entity with coordinates + test_entity = None + for entity in all_entities: + if ( + entity["coordinates"]["latitude"] is not None + and entity["coordinates"]["longitude"] is not None + ): + test_entity = entity + break + + if test_entity is None: + pytest.skip("No entities with valid coordinates for geographic test") + + lat = test_entity["coordinates"]["latitude"] + lng = test_entity["coordinates"]["longitude"] + + # Search with nearby (should include the entity) + nearby_params = { + "latitude": lat, + "longitude": lng, + "radius_meters": 1000, # 1km radius + } + nearby_response = test_client.get("/bertron/geo/nearby", params=nearby_params) + assert nearby_response.status_code == status.HTTP_200_OK + nearby_entities = nearby_response.json()["documents"] + + # The test entity should be found in nearby search + found_in_nearby = any(e["id"] == test_entity["id"] for e in nearby_entities) + assert found_in_nearby, ( + f"Entity {test_entity['id']} should be found in nearby search" + ) diff --git a/tests/test_hello.py b/tests/test_hello.py deleted file mode 100644 index e67e6cc..0000000 --- a/tests/test_hello.py +++ /dev/null @@ -1,4 +0,0 @@ -# A trivial test! - -def test_hello(): - assert True diff --git a/src/tests/test_server.py b/tests/test_server.py similarity index 94% rename from src/tests/test_server.py rename to tests/test_server.py index b51128a..eb0c9b0 100644 --- a/src/tests/test_server.py +++ b/tests/test_server.py @@ -9,8 +9,8 @@ from fastapi.testclient import TestClient from starlette import status -from models import HealthResponse, VersionResponse -from server import app +from src.models import HealthResponse, VersionResponse +from src.server import app @pytest.fixture diff --git a/uv.lock b/uv.lock index 03572a5..4f1c357 100644 --- a/uv.lock +++ b/uv.lock @@ -110,17 +110,16 @@ source = { editable = "." 
} dependencies = [ { name = "bertron-schema" }, { name = "fastapi", extra = ["standard"] }, + { name = "httpx" }, { name = "jsonschema" }, { name = "nmdc-api-utilities" }, { name = "pydantic-settings" }, { name = "pymongo" }, - { name = "pytest" }, { name = "uvicorn" }, ] [package.dev-dependencies] dev = [ - { name = "httpx" }, { name = "pre-commit" }, { name = "pyright" }, { name = "pytest" }, @@ -131,20 +130,19 @@ dev = [ requires-dist = [ { name = "bertron-schema", git = "https://github.com/ber-data/bertron-schema.git" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" }, + { name = "httpx", specifier = ">=0.28.1" }, { name = "jsonschema", specifier = ">=4.0.0" }, { name = "nmdc-api-utilities", specifier = ">=0.3.9" }, { name = "pydantic-settings", specifier = ">=2.10.1" }, { name = "pymongo", specifier = ">=4.13.1" }, - { name = "pytest", specifier = ">=8.4.0" }, { name = "uvicorn", specifier = ">=0.34.3" }, ] [package.metadata.requires-dev] dev = [ - { name = "httpx", specifier = ">=0.28.1" }, { name = "pre-commit", specifier = ">=4.1.0" }, { name = "pyright", specifier = ">=1.1.386" }, - { name = "pytest", specifier = ">=8.3.5" }, + { name = "pytest", specifier = ">=8.4.1" }, { name = "ruff", specifier = ">=0.9.9" }, ]