Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion api/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@ class Settings(BaseSettings):
DEBUG: bool = False

# used for validation
WS_LONGFORM_SCHEMA_URL: str = (
LONGFORM_SCHEMA_URL: str = (
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You saw the things here can be overridden by env vars? Just checking the var is named this and not what was there before? :-)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When you see this and respond, you can either merge or fix + merge. ty!

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, this one didn't match the existing variable name, so I dropped the WS_ prefix for consistency with the rest of these vars. It gets mapped here:

https://github.com/TaskarCenterAtUW/workspaces-stack/blob/0f66fc91815b4965f817cc4c72b1122ed228081f/docker-compose.deploy.yml#L94

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perfect, copy that. Merging then.

"https://raw.githubusercontent.com/TaskarCenterAtUW/asr-quests/refs/heads/main/schema/schema.json"
)
IMAGERY_SCHEMA_URL: str = (
"https://raw.githubusercontent.com/TaskarCenterAtUW/asr-imagery-list/refs/heads/main/schema/schema.json"
)
Comment thread
cyrossignol marked this conversation as resolved.

# proxy destination--"osm-web" is a virtual docker network endpoint
WS_OSM_HOST: str = "http://osm-web"
Expand Down
124 changes: 124 additions & 0 deletions api/core/json_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import asyncio
import json
from typing import Any

import httpx
import jsonschema
from fastapi import HTTPException, status

from api.core.config import settings

# Shared HTTP client. Initialized by main.py lifespan.
_http_client: httpx.AsyncClient | None = None

_longform_schema: dict | None = None
_longform_schema_lock = asyncio.Lock()

_imagery_schema: dict | None = None
_imagery_schema_lock = asyncio.Lock()


def init_json_schema_client() -> None:
global _http_client
_http_client = httpx.AsyncClient(
timeout=httpx.Timeout(connect=10, read=30, write=30, pool=10),
)


def get_http_client() -> httpx.AsyncClient | None:
return _http_client


async def close_json_schema_client() -> None:
global _http_client
if _http_client is not None:
await _http_client.aclose()
_http_client = None


async def _fetch_longform_schema() -> dict:
global _longform_schema
if _longform_schema is None:
async with _longform_schema_lock:
if _longform_schema is None:
response = await _http_client.get(settings.LONGFORM_SCHEMA_URL)
response.raise_for_status()
_longform_schema = response.json()
return _longform_schema


async def _fetch_imagery_schema() -> dict:
global _imagery_schema
if _imagery_schema is None:
async with _imagery_schema_lock:
if _imagery_schema is None:
response = await _http_client.get(settings.IMAGERY_SCHEMA_URL)
response.raise_for_status()
_imagery_schema = response.json()
return _imagery_schema


def _raise_for_fetch_error(e: Exception, label: str) -> None:
if isinstance(e, httpx.TimeoutException):
raise HTTPException(
status_code=status.HTTP_504_GATEWAY_TIMEOUT,
detail=f"Timed out fetching {label} schema",
)
raise HTTPException(
status_code=status.HTTP_502_BAD_GATEWAY,
detail=f"Failed to fetch {label} schema: {e}",
)


async def validate_quest_definition_schema(definition: str) -> None:
"""
Parse, type-check, and validate a quest definition string against the long-
form quest JSON schema.
"""
try:
parsed = json.loads(definition)
except json.JSONDecodeError as e:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"'definition' must be valid JSON: {e}",
)
if not parsed or not isinstance(parsed, dict):
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="'definition' must be a JSON object.",
)

try:
schema = await _fetch_longform_schema()
except HTTPException:
raise
except Exception as e:
_raise_for_fetch_error(e, "quest")
Comment on lines +95 to +96
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify current broad exception handling and where it appears.
rg -n -C2 'except Exception as e:' api/core/json_schema.py

# Confirm narrowed fetch-only exception handling after applying the fix.
rg -n -C2 'except \(httpx\.TimeoutException, httpx\.HTTPError\) as e:' api/core/json_schema.py

Repository: TaskarCenterAtUW/workspaces-backend

Length of output: 341


🏁 Script executed:

# Read the file to see the full context around the problematic lines
head -120 api/core/json_schema.py | tail -30

Repository: TaskarCenterAtUW/workspaces-backend

Length of output: 956


🏁 Script executed:

# Check the imports at the top of the file to see what's imported
head -20 api/core/json_schema.py

Repository: TaskarCenterAtUW/workspaces-backend

Length of output: 497


🏁 Script executed:

# Look for _raise_for_fetch_error function definition
rg -n -A 5 'def _raise_for_fetch_error' api/core/json_schema.py

Repository: TaskarCenterAtUW/workspaces-backend

Length of output: 358


🏁 Script executed:

# Search for httpx imports to verify the exception types
rg -n 'import.*httpx|from.*httpx' api/core/json_schema.py

Repository: TaskarCenterAtUW/workspaces-backend

Length of output: 94


🏁 Script executed:

# Get the complete _raise_for_fetch_error function
sed -n '61,80p' api/core/json_schema.py

Repository: TaskarCenterAtUW/workspaces-backend

Length of output: 757


🏁 Script executed:

# Look for the _fetch_longform_schema and _fetch_imagery_schema functions to see what they do
rg -n -A 10 'async def _fetch_longform_schema|async def _fetch_imagery_schema' api/core/json_schema.py

Repository: TaskarCenterAtUW/workspaces-backend

Length of output: 923


🏁 Script executed:

# Check what httpx exception types are available and used
rg -n 'httpx\.' api/core/json_schema.py

Repository: TaskarCenterAtUW/workspaces-backend

Length of output: 346


🌐 Web query:

httpx exception hierarchy TimeoutException HTTPError HTTPStatusError

💡 Result:

In the httpx library, all exceptions inherit from a common base class, HTTPError [1]. The exception hierarchy is structured to allow for granular error handling, distinguishing between general request issues, transport-level problems, and HTTP status errors [1][2]. The relationship between the requested exceptions is as follows: 1. HTTPError: This is the base class for all exceptions raised by the httpx library [1][2]. 2. TimeoutException: This class inherits from TransportError, which in turn inherits from RequestError, and finally from HTTPError [1]. It serves as the base class for all timeout-related errors, including ConnectTimeout, ReadTimeout, WriteTimeout, and PoolTimeout [1][2]. 3. HTTPStatusError: This class inherits directly from HTTPError [1]. It is raised when a response has an error status code (e.g., 4xx or 5xx) and raise_for_status is called [2]. The full hierarchy is organized as follows [1]: - HTTPError - RequestError - TransportError - TimeoutException (base for ConnectTimeout, ReadTimeout, WriteTimeout, PoolTimeout) - NetworkError (base for ConnectError, ReadError, WriteError, CloseError) - ProtocolError (base for LocalProtocolError, RemoteProtocolError) - ProxyError - UnsupportedProtocol - DecodingError - TooManyRedirects - HTTPStatusError Other notable exceptions, such as InvalidURL, CookieConflict, and various StreamError subclasses, also exist outside of the RequestError branch but remain under the HTTPError umbrella [1][2].

Citations:


🏁 Script executed:

# Check if there are other exception handlers in the codebase for httpx
rg -n 'except.*httpx\.' api/core/json_schema.py

Repository: TaskarCenterAtUW/workspaces-backend

Length of output: 61


🏁 Script executed:

# Check if json.JSONDecodeError is handled elsewhere or if there are similar patterns in the codebase
rg -n 'json\.JSONDecodeError' api/core/json_schema.py

Repository: TaskarCenterAtUW/workspaces-backend

Length of output: 120


🏁 Script executed:

# Look at the usage context - what calls validate_quest_definition_schema
rg -rn 'validate_quest_definition_schema|validate_imagery_definition_schema' --type py

Repository: TaskarCenterAtUW/workspaces-backend

Length of output: 483


Catch specific exceptions instead of all exceptions around schema fetch.

Lines 95-96 and 115-116 currently catch all exceptions and convert them to 502 errors, which can hide real app bugs like malformed JSON responses. The fetch operations call _http_client.get(), response.raise_for_status(), and response.json(), which can raise httpx.HTTPError (covers all httpx exceptions including timeouts) and json.JSONDecodeError. Catch these specific exceptions:

-    except Exception as e:
+    except (httpx.HTTPError, json.JSONDecodeError) as e:
         _raise_for_fetch_error(e, "quest")

Apply the same fix at line 115-116 for the imagery schema fetch.

Also applies to: 115-116

🧰 Tools
🪛 Ruff (0.15.13)

[warning] 95-95: Do not catch blind exception: Exception

(BLE001)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@api/core/json_schema.py` around lines 95 - 96, Replace the broad except
Exception handlers around the HTTP/schema fetch logic with specific exception
catches: catch httpx.HTTPError (to cover network, status and timeout errors from
_http_client.get()/response.raise_for_status()) and json.JSONDecodeError (for
response.json() failures), and pass those exceptions into _raise_for_fetch_error
with the appropriate schema identifier ("quest" and "imagery"); update both
catch sites that currently call _raise_for_fetch_error(e, "quest") and
_raise_for_fetch_error(e, "imagery") so only these two exception types are
converted to 502s while letting other exceptions propagate.


try:
jsonschema.validate(instance=parsed, schema=schema)
except jsonschema.ValidationError as e:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"{e.message} at {list(e.path)}",
)


async def validate_imagery_definition_schema(definition: list[Any]) -> None:
"""
Validate the provided definition against the imagery list schema.
"""
try:
schema = await _fetch_imagery_schema()
except HTTPException:
raise
except Exception as e:
_raise_for_fetch_error(e, "imagery")

try:
jsonschema.validate(instance=definition, schema=schema)
except jsonschema.ValidationError as e:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"{e.message} at {list(e.path)}",
)
9 changes: 7 additions & 2 deletions api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from api.core import config
from api.core.config import settings
from api.core.database import get_task_session
from api.core.json_schema import close_json_schema_client, init_json_schema_client
from api.core.logging import get_logger, setup_logging
from api.core.security import (
UserInfo,
Expand Down Expand Up @@ -59,13 +60,15 @@ async def lifespan(_app: FastAPI):
timeout=httpx.Timeout(connect=10, read=7200, write=7200, pool=10),
)
init_tdei_client()
init_json_schema_client()

yield # App runs

# Run after app cleanup:
await _osm_client.aclose()
_osm_client = None
await close_tdei_client()
await close_json_schema_client()


app = FastAPI(
Expand Down Expand Up @@ -141,8 +144,10 @@ def get_workspace_repository(
AUTH_WHITELIST_PATTERNS = [
re.compile(p)
for p in [
r"^/api/0\.6/user/.*$", # used during authentication
r"^/api/0\.6/workspaces/[0-9]+/bbox\.json$", # used to get workspace bbox without workspace header, to be removed
# Creating/deleting workspaces and JOSM path rewriting:
r"^/api/0\.6/workspaces.*$",
# Provisioning users during authentication:
r"^/api/0\.6/user/.*$",
Comment on lines +147 to +150
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical | ⚡ Quick win

Whitelist regex is overly broad and weakens access control.

Line 148 (^/api/0\.6/workspaces.*$) matches every /workspaces proxy path, so requests can skip X-Workspace checks far beyond create/delete operations.

Suggested change
-AUTH_WHITELIST_PATTERNS = [
-    re.compile(p)
-    for p in [
-        r"^/api/0\.6/workspaces.*$",
-        r"^/api/0\.6/user/.*$",
-    ]
-]
+AUTH_WHITELIST = [
+    (re.compile(r"^/api/0\.6/workspaces/create\.json$"), {"POST"}),
+    (re.compile(r"^/api/0\.6/workspaces/[0-9]+/delete\.json$"), {"POST", "DELETE"}),
+    (re.compile(r"^/api/0\.6/user/.*$"), {"GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS", "HEAD"}),
+]
...
-        if not any(p.fullmatch(request.url.path) for p in AUTH_WHITELIST_PATTERNS):
+        if not any(
+            p.fullmatch(request.url.path) and request.method in methods
+            for p, methods in AUTH_WHITELIST
+        ):
             raise HTTPException(
                 status_code=status.HTTP_400_BAD_REQUEST,
                 detail="No X-Workspace header supplied",
             )

Also applies to: 228-232

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@api/main.py` around lines 147 - 150, The whitelist regex
r"^/api/0\.6/workspaces.*$" is too permissive and bypasses X-Workspace checks
for all /workspaces paths; narrow the pattern(s) in api/main.py (and the similar
entries around lines 228-232) to match only the specific endpoints that should
bypass workspace checks (e.g., the exact create/delete/JOSM rewrite endpoints)
by using precise path tokens and anchors (avoid trailing ".*"), and if the
bypass is conditional on HTTP method, ensure the logic checks the request method
in the same handler (refer to the whitelist entries containing
r"^/api/0\.6/workspaces" and the corresponding JOSM/provisioning whitelist
entries to implement the stricter regexes or method-aware checks).

]
]

Expand Down
Loading
Loading