diff --git a/backend/app/controllers/bmdb_controller.py b/backend/app/controllers/bmdb_controller.py
new file mode 100644
index 0000000..3094233
--- /dev/null
+++ b/backend/app/controllers/bmdb_controller.py
@@ -0,0 +1,54 @@
+import httpx
+from typing import List
+from fastapi import HTTPException, Response
+from app.schemas.bmdb_schema import BMDBRequestParams
+from app.services.databases_service import (
+ get_xml_file,
+ fetch_bmdb_models,
+ get_bmdb_model_info,
+)
+
+
+async def get_bmdb_models_controller(params: BMDBRequestParams) -> dict:
+    """
+    Controller function to search the BioModels database (BMDB).
+
+    Args:
+        params: Search parameters, forwarded unchanged to `fetch_bmdb_models`.
+    Returns:
+        dict: The result of `fetch_bmdb_models`.
+    Raises:
+        HTTPException: The upstream status code on an HTTP error response, else 500.
+    """
+    try:
+        return await fetch_bmdb_models(params)
+    except httpx.HTTPStatusError as e:  # BMDB replied with an error status: mirror it
+        raise HTTPException(status_code=e.response.status_code, detail="Error fetching biomodels.") from e
+    except httpx.RequestError as e:  # the request itself could not be completed
+        raise HTTPException(status_code=500, detail="Error communicating with BMDB API.") from e
+    except Exception as e:  # boundary catch-all; `from e` keeps the cause in the traceback
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+
+async def get_xml_controller(bmdbID: str, truncate: bool = False) -> str:
+    """
+    Controller function to fetch the XML file contents of a BMDB biomodel via `get_xml_file`.
+    Raises:
+        HTTPException: 500 if `get_xml_file` raises for any reason.
+    """
+    try:
+        return await get_xml_file(bmdbID, truncate)
+    except Exception as e:  # chain the cause so the original failure stays in the traceback
+        raise HTTPException(status_code=500, detail="Error fetching XML file.") from e
+
+
+async def get_bmdb_model_info_controller(bmdbID: str) -> dict:
+    """
+    Controller function to fetch information about one BMDB model via `get_bmdb_model_info`.
+    Raises:
+        HTTPException: 500 if `get_bmdb_model_info` raises for any reason.
+    """
+    try:
+        return await get_bmdb_model_info(bmdbID)
+    except Exception as e:  # chain the cause so the original failure stays in the traceback
+        raise HTTPException(status_code=500, detail="Error fetching BMDB model info.") from e
\ No newline at end of file
diff --git a/backend/app/controllers/llms_controller.py b/backend/app/controllers/llms_controller.py
index fe271f5..73bacb3 100644
--- a/backend/app/controllers/llms_controller.py
+++ b/backend/app/controllers/llms_controller.py
@@ -7,7 +7,7 @@
)
-async def get_llm_response(conversation_history: list[dict]) -> tuple[str, list]:
+async def get_llm_response(conversation_history: list[dict], database: str) -> tuple[str, list]:
"""
Controller function to interact with the LLM service.
Args:
@@ -16,8 +16,9 @@ async def get_llm_response(conversation_history: list[dict]) -> tuple[str, list]
tuple[str, list]: A tuple containing the final response and bmkeys list.
"""
try:
- result, bmkeys = await get_response_with_tools(conversation_history)
- return result, bmkeys
+ print("DEBUG20: BMDB POST: get_llm_response")
+ result, bmkeys, tool_summary = await get_response_with_tools(conversation_history, database)
+ return result, bmkeys, tool_summary
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
diff --git a/backend/app/controllers/vcelldb_controller.py b/backend/app/controllers/vcelldb_controller.py
index 93371ff..a95668a 100644
--- a/backend/app/controllers/vcelldb_controller.py
+++ b/backend/app/controllers/vcelldb_controller.py
@@ -2,7 +2,7 @@
from typing import List
from fastapi import HTTPException, Response
from app.schemas.vcelldb_schema import BiomodelRequestParams, SimulationRequestParams
-from app.services.vcelldb_service import (
+from app.services.databases_service import (
fetch_biomodels,
fetch_simulation_details,
get_vcml_file,
@@ -21,7 +21,9 @@ async def get_biomodels_controller(params: BiomodelRequestParams) -> dict:
HTTPException: If the VCell API request fails.
"""
try:
+ print("About to call fetch_biomodels()")
biomodels = await fetch_biomodels(params)
+ print("fetch_biomodels() completed successfully")
return biomodels
except httpx.HTTPStatusError as e:
raise HTTPException(
@@ -142,7 +144,9 @@ async def get_publications_controller() -> List[dict]:
HTTPException: If the VCell API request fails.
"""
try:
+ print("About to call fetch_publications()")
publications = await fetch_publications()
+ print("fetch_publications() completed successfully")
return publications
except httpx.HTTPStatusError as e:
raise HTTPException(
diff --git a/backend/app/core/config.py b/backend/app/core/config.py
index 7a9df5b..40ece26 100644
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -1,4 +1,4 @@
-from pydantic_settings import BaseSettings
+from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
@@ -24,4 +24,10 @@ class Settings(BaseSettings):
LANGFUSE_PUBLIC_KEY: str
LANGFUSE_HOST: str
+ model_config = SettingsConfigDict(
+ env_file=".env",
+ env_file_encoding="utf-8",
+ extra="ignore",
+ )
+
settings = Settings()
diff --git a/backend/app/main.py b/backend/app/main.py
index f48d10b..bec6694 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -14,6 +14,7 @@
from app.routes.llms_router import router as llms_router
from app.routes.qdrant_router import router as qdrant_router
from app.routes.knowledge_base_router import router as knowledge_base_router
+from app.routes.bmdb_router import router as bmdb_router
ascii_art = """
╔════════════════════════════════════════════════════════════════════════════════════╗
@@ -54,6 +55,7 @@ async def startup_event():
# Including the routers
app.include_router(knowledge_base_router, tags=["Knowledge Base"], prefix="/kb")
app.include_router(llms_router, tags=["LLM with Tool Calling"])
+app.include_router(bmdb_router, tags=["BMDB API Wrapper"])
app.include_router(vcelldb_router, tags=["VCellDB API Wrapper"])
app.include_router(qdrant_router, tags=["Qdrant Vector DB"], prefix="/qdrant")
diff --git a/backend/app/routes/bmdb_router.py b/backend/app/routes/bmdb_router.py
new file mode 100644
index 0000000..42c9943
--- /dev/null
+++ b/backend/app/routes/bmdb_router.py
@@ -0,0 +1,42 @@
+from fastapi import APIRouter, Depends, HTTPException
+from app.schemas.bmdb_schema import BMDBRequestParams
+from app.controllers.bmdb_controller import (
+ get_bmdb_models_controller,
+ get_xml_controller,
+ get_bmdb_model_info_controller,
+)
+
+router = APIRouter()
+
+
+@router.get("/search", response_model=dict)
+async def get_biomodels(params: BMDBRequestParams = Depends()):
+    """
+    Search BMDB models using the supplied query parameters.
+
+    An HTTPException raised by the controller propagates to FastAPI unchanged.
+    """
+    found_models = await get_bmdb_models_controller(params)
+    return found_models
+
+
+@router.get("/get-xml", response_model=str)
+async def get_xml(bmdbID: str, truncate: bool = False):
+    """
+    Return the XML file contents of the biomodel identified by `bmdbID`.
+
+    `truncate` is handed to the controller as-is; controller HTTPExceptions propagate.
+    """
+    xml_text = await get_xml_controller(bmdbID, truncate)
+    return xml_text
+
+
+@router.get("/model-info", response_model=dict)
+async def get_model_info(bmdbID: str):
+    """
+    Return the information BMDB holds about the model identified by `bmdbID`.
+
+    An HTTPException raised by the controller propagates to FastAPI unchanged.
+    """
+    model_info = await get_bmdb_model_info_controller(bmdbID)
+    return model_info
\ No newline at end of file
diff --git a/backend/app/routes/llms_router.py b/backend/app/routes/llms_router.py
index 35ed27a..3607ad8 100644
--- a/backend/app/routes/llms_router.py
+++ b/backend/app/routes/llms_router.py
@@ -1,4 +1,10 @@
-from fastapi import APIRouter
+from multiprocessing import process
+from fastapi import APIRouter, Depends, HTTPException, Response
+from typing import List
+from app.schemas.bmdb_schema import BMDBRequestParams
+from app.schemas.vcelldb_schema import BiomodelRequestParams
+import httpx
+import requests
from app.controllers.llms_controller import (
get_llm_response,
analyse_biomodel_controller,
@@ -8,6 +14,25 @@
router = APIRouter()
+# BioModels database (BMDB) variant of the LLM query endpoint
+@router.post("/bmdb-search")
+async def search_llm(conversation_history: dict):
+    """
+    Endpoint to query the LLM against BMDB and execute the necessary tools.
+    Args:
+        conversation_history (dict): Request body; its "conversation_history" key
+            holds the list of chat messages (an empty list is used if it is absent).
+    Returns:
+        dict: "response" (final LLM text), "bmkeys" (model ids gathered from the
+            tool results) and "tool_summary" (timing notes built by the service).
+    """
+    history = conversation_history.get("conversation_history", [])
+    # The database is fixed to "bmdb" for this route; it is not a request field.
+    result, bmdbkeys, tool_summary = await get_llm_response(history, database="bmdb")
+
+    return {"response": result, "bmkeys": bmdbkeys, "tool_summary": tool_summary}
+
+
@router.post("/query")
async def query_llm(conversation_history: dict):
@@ -15,13 +40,14 @@ async def query_llm(conversation_history: dict):
Endpoint to query the LLM and execute the necessary tools.
Args:
conversation_history (dict): The conversation history containing user prompts and responses.
+ database (str): The database to query - vcdb in this case.
Returns:
dict: The final response after processing the prompt with the tools.
"""
- result, bmkeys = await get_llm_response(
- conversation_history.get("conversation_history", [])
+ result, bmkeys, tool_summary = await get_llm_response(
+ conversation_history.get("conversation_history", []), database="vcdb"
)
- return {"response": result, "bmkeys": bmkeys}
+ return {"response": result, "bmkeys": bmkeys, "tool_summary": tool_summary}
@router.post("/analyse/{biomodel_id}")
diff --git a/backend/app/schemas/bmdb_schema.py b/backend/app/schemas/bmdb_schema.py
new file mode 100644
index 0000000..ce3d33f
--- /dev/null
+++ b/backend/app/schemas/bmdb_schema.py
@@ -0,0 +1,25 @@
+from pydantic import BaseModel
+from typing import Optional
+from datetime import date
+from enum import Enum
+
+
+class CategoryEnum(str, Enum):  # string-valued enum; not used by any field of BMDBRequestParams
+ all = "all"
+ public = "public"
+ shared = "shared"
+ tutorials = "tutorial"  # NOTE(review): member name is plural, value is singular - confirm intended
+ educational = "educational"
+
+
+class OrderByEnum(str, Enum):  # presumably sort orders (date/name, asc/desc); not used by BMDBRequestParams
+ date_desc = "date_desc"
+ date_asc = "date_asc"
+ name_desc = "name_desc"
+ name_asc = "name_asc"
+
+
+# Search parameters for the BioModels database (BMDB); both fields default to "".
+class BMDBRequestParams(BaseModel, use_enum_values=True):
+ bmName: Optional[str] = "" # Name (or keyword) to search for; fetch_bmdb_models prefers it over bmId when non-empty
+ bmId: Optional[str] = "" # Biomodel ID; used as the search query only when bmName is empty
diff --git a/backend/app/schemas/vcelldb_schema.py b/backend/app/schemas/vcelldb_schema.py
index 51d467c..8964e47 100644
--- a/backend/app/schemas/vcelldb_schema.py
+++ b/backend/app/schemas/vcelldb_schema.py
@@ -33,7 +33,6 @@ class BiomodelRequestParams(BaseModel, use_enum_values=True):
OrderByEnum.date_desc
) # Order of results (default is "date_desc")
-
class SimulationRequestParams(BaseModel):
bmId: str # Biomodel ID for which simulations will be fetched
simId: str # Simulation ID to fetch specific simulation details
diff --git a/backend/app/services/vcelldb_service.py b/backend/app/services/databases_service.py
similarity index 75%
rename from backend/app/services/vcelldb_service.py
rename to backend/app/services/databases_service.py
index 8382024..a9c6a12 100644
--- a/backend/app/services/vcelldb_service.py
+++ b/backend/app/services/databases_service.py
@@ -8,8 +8,10 @@
from typing import List
VCELL_API_BASE_URL = "https://vcell.cam.uchc.edu/api/v0"
+BIOMODELS_API_URL = "https://biomodels.org/"
logger = get_logger("vcelldb_service")
+print("CHECK: in VCELL_DB_SERVICE")
def sanitize_vcml_content(vcml_content: str) -> str:
@@ -38,6 +40,10 @@ def sanitize_vcml_content(vcml_content: str) -> str:
logger.info("VCML content sanitized: ImageData tags removed")
return sanitized_content
+# def sanitize_xml_content(vcml_content: str) -> str:
+
+# return sanitized_content
+
async def check_vcell_connectivity() -> bool:
"""
@@ -75,8 +81,11 @@ async def fetch_biomodels(params: BiomodelRequestParams) -> dict:
Returns:
dict: A dictionary containing a list of biomodels with metadata.
"""
+
+ print("CHECK: in VCELL_DB_SERVICE")
# Transform None to "" (optional, only if needed for empty fields)
params_dict = {k: (v if v is not None else "") for k, v in params.dict().items()}
+ print("DEBUG: " + str(params_dict))
logger.info(f"Fetching biomodels with parameters: {params_dict}")
@@ -120,6 +129,7 @@ async def fetch_simulation_details(params: SimulationRequestParams) -> dict:
Returns:
Simulation: A Simulation object containing simulation details.
"""
+ print("CHECK: in VCELL_DB_SERVICE")
async with httpx.AsyncClient() as client:
response = await client.get(
f"{VCELL_API_BASE_URL}/biomodel/{params.bmId}/simulation/{params.simId}"
@@ -127,6 +137,126 @@ async def fetch_simulation_details(params: SimulationRequestParams) -> dict:
response.raise_for_status()
return response.json()
+@observe(name="FETCH_BMDB_MODELS")
+async def fetch_bmdb_models(params: BiomodelRequestParams) -> dict:
+    """
+    Search the BioModels database and return the matches with search metadata.
+
+    Args:
+        params: Object exposing `bmName` and `bmId`; `bmName` is used as the
+            query when non-empty, otherwise `bmId`.
+            NOTE(review): annotated as BiomodelRequestParams, but the BMDB
+            controller passes a BMDBRequestParams - confirm and fix the import.
+    Returns:
+        dict: `search_params` (echo of bmId/bmName), `models_count` and `data`
+        (the `models` list of the JSON reply, or [] when that key is absent).
+    Raises:
+        httpx.HTTPStatusError: If `raise_for_status()` rejects the response.
+        httpx.RequestError: If the request itself fails.
+    """
+    query_string = params.bmName or params.bmId or ""
+    url = f"{BIOMODELS_API_URL}search"
+    logger.info(f"Querying URL: {url} with query: {query_string!r}")
+
+    async with httpx.AsyncClient() as client:
+        # Pass the query via `params` so httpx percent-encodes spaces, '&', '#', ...
+        # instead of splicing raw user text into the URL.
+        response = await client.get(url, params={"query": query_string, "format": "json"})
+        response.raise_for_status()
+        raw_data = response.json()
+    logger.info(f"BMDB search status: {response.status_code}")
+
+    biomodels = raw_data.get("models", [])
+    return {
+        "search_params": {"bmId": params.bmId, "bmName": params.bmName},
+        "models_count": len(biomodels),
+        "data": biomodels,
+    }
+
+
+@observe(name="GET_XML_FILE")
+async def get_xml_file(bmId: str, truncate: bool = False, max_retries: int = 3) -> str:
+    """
+    Download the XML file of a BioModels (BMDB) model, retrying on failure.
+
+    The file requested is `<bmId>_url.xml` from the model's download endpoint.
+    Every failure (HTTP error status, transport error, anything else) is logged
+    and retried after an exponential back-off (1s, 2s, 4s, ...); once the
+    retries are used up the last error is re-raised unchanged.
+
+    Args:
+        bmId: BioModels identifier of the model.
+        truncate: Accepted for interface compatibility but currently ignored;
+            the full document is always returned.
+        max_retries: Number of retries after the first attempt.
+    Returns:
+        str: The response body as text.
+    Raises:
+        httpx.HTTPStatusError: Error status on the final attempt.
+        httpx.RequestError: Transport failure on the final attempt.
+        Exception: Any other error on the final attempt, or when no attempt
+            was made at all (negative `max_retries`).
+    """
+    from urllib.parse import quote
+
+    logger.info(f"Fetching XML file for biomodel: {bmId}")
+
+    # No VCell connectivity pre-check here: this request goes to the BioModels
+    # host, so an unreachable VCell API must not block it. Reachability problems
+    # with BioModels itself surface as httpx.RequestError and are retried below.
+    # quote() keeps an unusual id from altering the URL path.
+    url = f"{BIOMODELS_API_URL}model/download/{quote(bmId, safe='')}"
+    query = {"filename": f"{bmId}_url.xml"}  # percent-encoded by httpx
+    total_attempts = max_retries + 1
+
+    for attempt in range(total_attempts):
+        try:
+            logger.info(f"Requesting URL: {url} (attempt {attempt + 1}/{total_attempts})")
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response = await client.get(url, params=query)
+                logger.info(f"Response status: {response.status_code}")
+                logger.info(f"Response headers: {dict(response.headers)}")
+                response.raise_for_status()
+                return response.text
+        except Exception as e:
+            # One handler for all failures; only the log wording depends on the type.
+            if isinstance(e, httpx.HTTPStatusError):
+                problem = f"HTTP error fetching XML file for biomodel {bmId}: {e.response.status_code} - {e.response.text}"
+            elif isinstance(e, httpx.RequestError):
+                problem = f"Request error fetching XML file for biomodel {bmId}: {str(e)}"
+            else:
+                problem = f"Unexpected error fetching XML file for biomodel {bmId}: {str(e)}"
+            logger.error(problem)
+
+            if attempt == max_retries:
+                raise  # bare raise keeps the original traceback
+            delay = 2**attempt
+            logger.warning(f"Retrying in {delay} seconds...")
+            await asyncio.sleep(delay)
+
+    # Only reachable when the loop body never ran (max_retries < 0).
+    raise Exception(
+        f"Failed to fetch XML file for biomodel {bmId} after {total_attempts} attempts"
+    )
+
+
+@observe(name="GET_BMDB_MODEL_INFO")
+async def get_bmdb_model_info(bmdbID: str) -> dict:
+    """
+    Fetch the JSON description of one BMDB model.
+
+    Returns the decoded JSON body as-is. Raises httpx.HTTPStatusError when
+    `raise_for_status()` rejects the response and httpx.RequestError when the
+    request itself fails.
+    """
+    # BIOMODELS_API_URL already ends with "/"; strip it so the path has no "//".
+    url = f"{BIOMODELS_API_URL.rstrip('/')}/{bmdbID}"
+    logger.info(f"Fetching BMDB model info from URL: {url}")
+    async with httpx.AsyncClient() as client:
+        response = await client.get(url, params={"format": "json"})
+        response.raise_for_status()
+        return response.json()
+
@observe(name="GET_VCML_FILE")
async def get_vcml_file(
diff --git a/backend/app/services/llms_service.py b/backend/app/services/llms_service.py
index 5d5e91d..c12948b 100644
--- a/backend/app/services/llms_service.py
+++ b/backend/app/services/llms_service.py
@@ -1,15 +1,21 @@
+# IMPLEMENTATION: separating tools into subsets and sending only relevant tools to llm
from app.utils.tools_utils import (
- ToolsDefinitions as tools,
- execute_tool,
+ BMDB_TOOLS as bmdbtools,
+ execute_tool,
+ select_tools_for_prompt,
+ should_use_tools,
+ default_rows,
)
-from app.services.vcelldb_service import (
+from app.services.databases_service import (
fetch_biomodels,
get_vcml_file,
get_diagram_url,
)
from app.utils.system_prompt import SYSTEM_PROMPT
+from app.utils.bmdb_system_prompt import BMDB_SYSTEM_PROMPT
+from app.utils.vcdb_system_prompt import VCDB_SYSTEM_PROMPT
from app.schemas.vcelldb_schema import BiomodelRequestParams
from app.core.singleton import get_openai_client
@@ -17,9 +23,61 @@
import json
from app.core.logger import get_logger
+import time
+
+# Timing helper used for lightweight request profiling
+def log_timing(label: str, start: float):
+    """Log, at INFO level, the seconds elapsed since the `time.perf_counter()` reading `start`."""
+    logger.info(f"{label}: {time.perf_counter() - start:.3f}s")
+
logger = get_logger("llm_service")
client = get_openai_client()
+# Helper: text of the most recent user turn in the conversation history
+def _last_user_message(conversation_history: list[dict]) -> str:
+    """Return the stripped content of the newest user message with truthy content, else ""."""
+    newest_first = reversed(conversation_history)
+    texts = (str(t["content"]).strip() for t in newest_first if t.get("role") == "user" and t.get("content"))
+    return next(texts, "")
+
+# Tool-free completion path for simple, conversational queries
+def _direct_chat_completion(messages: list[dict]) -> str:
+    """Send `messages` to the chat model without tools; return the reply text ("" if empty)."""
+    request = dict(name="GET_RESPONSE_DIRECT", model=settings.AZURE_DEPLOYMENT_NAME, messages=messages)
+    completion = client.chat.completions.create(**request)
+    first_choice = completion.choices[0]
+    reply_text = first_choice.message.content
+    return reply_text if reply_text else ""
+
+
+# Compact a tool result before it is sent back to the LLM; the tool message
+# format itself is not changed, only the payload is shortened.
+def summarize_tool_result(result):
+    """
+    Shrink a dict result that has a "models" key; return anything else unchanged.
+
+    Keeps at most the first 5 models, each reduced to id, name, score and the first
+    200 characters of its description. A None description becomes "" and a None
+    "models" value an empty list; both previously raised TypeError.
+    """
+    if not (isinstance(result, dict) and "models" in result):
+        return result
+    models = result["models"] or []
+    trimmed = [
+        {"id": m.get("id"), "name": m.get("name"), "description": (m.get("description") or "")[:200], "score": m.get("score")}
+        for m in models[:5]
+    ]
+    return {"models": trimmed, "total": result.get("total")}
+
+
+async def timed_tool_call(name, args):
+    """Run `execute_tool(name, args)`, log how long it took as "TOOL <name>", return its result."""
+    began_at = time.perf_counter()
+    outcome = await execute_tool(name, args)
+    log_timing(f"TOOL {name}", began_at)
+    return outcome
+
+
async def get_llm_response(system_prompt: str, user_prompt: str):
"""
@@ -44,27 +102,74 @@ async def get_llm_response(system_prompt: str, user_prompt: str):
return response.choices[0].message.content
-async def get_response_with_tools(conversation_history: list[dict]):
+async def get_response_with_tools(conversation_history: list[dict], database: str):
+ # start the total request timer for timing of the entire process
+ total_start = time.perf_counter()
messages = [
{
"role": "system",
- "content": SYSTEM_PROMPT,
+ "content": SYSTEM_PROMPT + (BMDB_SYSTEM_PROMPT if database == "bmdb" else VCDB_SYSTEM_PROMPT),
},
]
messages = messages + conversation_history
- user_prompt = conversation_history[-1]["content"]
+ # create a summary string of all timing logs to print to frontend
+ tool_summary = ""
- logger.info(f"User prompt: {user_prompt}")
+ # llm tool selection call
+ llm1_start = time.perf_counter()
- response = client.chat.completions.create(
- name="GET_RESPONSE_WITH_TOOLS::RETRIEVE_TOOLS",
- model=settings.AZURE_DEPLOYMENT_NAME,
- messages=messages,
- tools=tools,
- tool_choice="auto",
- )
+ if database == "bmdb":
+ print("DEBUG20: BMDB POST: get_response_with_tools")
+ response = client.chat.completions.create(
+ model=settings.AZURE_DEPLOYMENT_NAME,
+ messages=messages,
+ tools=bmdbtools,
+ tool_choice="auto",
+ )
+
+ # IMPLEMENTATION: changing the way llm sees/chooses tools
+ elif database == "vcdb":
+ # extract last user message
+ user_prompt = _last_user_message(conversation_history)
+ logger.info(f"User prompt: {user_prompt}")
+
+ # avoid the tool-calling process for simple, conversational prompts
+ if not should_use_tools(user_prompt):
+ # if no tools are used, then skip to immediate response
+ llm_direct_start = time.perf_counter()
+
+ # generate the response directly
+ final_response = _direct_chat_completion(messages)
+
+ # log timing for profiling
+ log_timing("LLM direct (no tools)", llm_direct_start)
+ log_timing("TOTAL REQUEST", total_start)
+
+ # return response with no tool calls
+ return final_response, [], "" # no tool summary since no tools used
+
+ # only include relevant tools to the llm instead of all tools
+ selected_tools = select_tools_for_prompt(user_prompt)
+ logger.info(f"TOOL SUBSET: {selected_tools}")
+
+ # first llm call to decide which tool to use from the given subset
+ response = client.chat.completions.create(
+ name="GET_RESPONSE_WITH_TOOLS::RETRIEVE_TOOLS",
+ model=settings.AZURE_DEPLOYMENT_NAME,
+ messages=messages,
+ tools=selected_tools,
+ tool_choice="auto",
+ )
+
+ # log timing after the llm selects which tool to use
+ log_timing("LLM1 - selecting tools from the subset", llm1_start)
+ llm1_time = time.perf_counter() - llm1_start
+ print(selected_tools)
+ tool_summary += f"*We selected subset tools: {', '.join([t.function.name for t in selected_tools])}* "
+ tool_summary += f"*The LLM call to select tools from the subset took {llm1_time:.2f}s.* "
+ tool_summary += f"*The LLM chose to use {len(response.choices[0].message.tool_calls)} tool(s) from the subset.* "
# Handle the tool calls
tool_calls = response.choices[0].message.tool_calls
@@ -73,45 +178,100 @@ async def get_response_with_tools(conversation_history: list[dict]):
bmkeys = []
- if tool_calls:
- for tool_call in tool_calls:
- # Extract the function name and arguments
- name = tool_call.function.name
- args = json.loads(tool_call.function.arguments)
- logger.info(f"Tool Call: {name} with args: {args}")
+ # introduce a fast path: if no tool_calls, return immediately
+ if not tool_calls:
+ direct_text = response.choices[0].message or ""
+ logger.info(f"LLM Response (no tools): {direct_text}")
+ return direct_text, bmkeys, ""
- # Execute the tool function
- result = await execute_tool(name, args)
+ # perform tool calls concurrently rather than sequentially to reduce response time
+ if tool_calls:
+ import asyncio
+ import json
- logger.info(f"Tool Result: {str(result)[:500]}")
+ # execute all tool calls concurrently
+ tasks = []
+ parsed_calls = []
+ tool_timings = []
+ for tool_call in tool_calls:
+ name = tool_call.function.name
+ args = json.loads(tool_call.function.arguments)
+ parsed_calls.append((tool_call, name, args))
+ tasks.append(timed_tool_call(name, args))
+
+ # log timing for how long the tool calls take to execute in total
+ tools_total_start = time.perf_counter()
+ results = await asyncio.gather(*tasks)
+
+ # log total time for all tool calls together
+ tools_total_time = time.perf_counter() - tools_total_start
+ log_timing("EXECUTION OF TOOL CALLS", tools_total_start)
+ tool_summary += f"*Executing the tool calls took {tools_total_time:.2f}s.* "
+
+
+ for (tool_call, name, args), result in zip(parsed_calls, results):
+ compact_result = summarize_tool_result(result)
+ messages.append({
+ "role": "tool",
+ "tool_call_id": tool_call.id,
+ "content": json.dumps(compact_result, ensure_ascii=False),
+ })
+
+ # log timing for each individual tool call
+ tool_timings.append({
+ "tool_name": name,
+ "args": args,
+ "duration_s": round(time.perf_counter() - tools_total_start, 3)
+ })
+ logger.info(f"Individual tool call timings: {tool_timings}")
+ tool_summary += f"Executing each tool call took: " + ", ".join([f"{t['tool_name']} ({t['duration_s']}s)" for t in tool_timings]) + "."
+
+ # extract the bmkeys
+ for tool_call in tool_calls:
+ bmkeys = []
# Extract bmkeys only if result is a dictionary and contains the expected key
if isinstance(result, dict):
- bmkeys = result.get("unique_model_keys (bmkey)", [])
+ if database == "vcdb":
+ bmkeys = result.get("unique_model_keys (bmkey)", [])
+ elif database == "bmdb":
+ bmdb_models = result.get("data", [])
+ bmkeys = [model.get("id") for model in bmdb_models if model.get("id")]
- # Send the result back to the model
- messages.append(
- {"role": "tool", "tool_call_id": tool_call.id, "content": str(result)}
- )
+
+ logger.info("DEBUG100-START")
+ print(len(str(messages)))
+ print("DEBUG100: ", messages)
- logger.info(str(messages))
+ # log timing for the final llm call that uses the tool result
+ llm2_start = time.perf_counter()
# Send back the final response incorporating the tool result
completion = client.chat.completions.create(
name="GET_RESPONSE_WITH_TOOLS::PROCESS_TOOL_RESULTS",
model=settings.AZURE_DEPLOYMENT_NAME,
messages=messages,
- metadata={
- "tool_calls": tool_calls,
- },
+ # metadata={
+ # "tool_calls": tool_calls,
+ # },
)
+ llm2_time = time.perf_counter() - llm2_start
+ log_timing("LLM2 (final response)", llm2_start)
+ tool_summary += f"*The final LLM call took {llm2_time:.2f}s.* "
+
+ logger.info("DEBUG100-END")
+
final_response = completion.choices[0].message.content
logger.info(f"LLM Response: {final_response}")
+ log_timing("TOTAL REQUEST TIME (from initial request to final output)", total_start)
+ total_time = time.perf_counter() - total_start
+ tool_summary += f"*Total request time: {total_time:.2f}s.*"
+ tool_summary += f"\n*Max rows fetched for list of biomodels was {default_rows}.*"
- return final_response, bmkeys
+ return final_response, bmkeys, tool_summary
async def analyse_vcml(biomodel_id: str):
diff --git a/backend/app/tests/test_vcelldb_service.py b/backend/app/tests/test_vcelldb_service.py
index a55e024..8dc9ee5 100644
--- a/backend/app/tests/test_vcelldb_service.py
+++ b/backend/app/tests/test_vcelldb_service.py
@@ -7,7 +7,7 @@
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
-from app.services.vcelldb_service import (
+from app.services.databases_service import (
fetch_biomodels,
fetch_simulation_details,
get_vcml_file,
diff --git a/backend/app/utils/bmdb_system_prompt.py b/backend/app/utils/bmdb_system_prompt.py
new file mode 100644
index 0000000..d93d6db
--- /dev/null
+++ b/backend/app/utils/bmdb_system_prompt.py
@@ -0,0 +1,27 @@
+BMDB_SYSTEM_PROMPT = """
+## Formatting Guidelines for Biomodels
+You MUST follow this exact output format. Do NOT modify, omit, or reorder any fields.
+ALWAYS use the provided name and biomodelID exactly. Format the name as [name](/search/${id}).
+
+### Formatting Guidelines for biomodels retrieved from BioModels database (BMDB)
+* For each BMDB model:
+```
+1. **[Biomodel Name](/search/${id})**
+ - **Biomodel Key:** ${id}
+ - **Owner:** ${owner}
+ - **Description:** ${description or summary of the biomodel, do not include `clonedFrom` info}
+```
+
+### Rules for LONG LISTS (>10 models)
+
+- ALWAYS continue numbering sequentially (1, 2, 3, ...)
+- Repeat the EXACT same structure for EVERY item
+- If applications exist, do NOT omit them
+- Do NOT summarize or shorten later items
+- Do NOT merge multiple models into one entry
+- Maintain identical formatting across all entries
+
+### Biomodel Analysis Guidelines
+* Include as many relevant details as possible, such as biomodel ID, names, descriptions, parameters, and any other relevant metadata that can aid in the user's understanding.
+* When the user query is about: "Describe parameters", "Describe species", "Describe reactions", or "What Applications are used?" — specifically in the context of model analysis: Make sure to use the `get_xml_file` tool to retrieve the SBML XML file for the BMDB biomodel. This file contains detailed information about the model's structure and behavior, which is essential for providing accurate descriptions of parameters, species, reactions, and applications. Use also the "fetch_bmdb_models" tool to gather additional context about the biomodel, and Try when asked these questions to focus on the asked aspects, Do not provide general summaries, model structure, or unrelated metadata unless explicitly requested. Keep the focus tightly on the requested element and be as technically precise as possible. Elaborate as much as you can on the requested aspect, providing detailed descriptions and explanations based on the SBML XML content.
+"""
diff --git a/backend/app/utils/system_prompt.py b/backend/app/utils/system_prompt.py
index da79c67..962a34b 100644
--- a/backend/app/utils/system_prompt.py
+++ b/backend/app/utils/system_prompt.py
@@ -1,9 +1,8 @@
SYSTEM_PROMPT = """
-You are a VCell BioModel Assistant, designed to help users understand and interact with biological models in VCell.
+You are a mathematical modeler in biology, designed to help users understand and interact with biological models in VCell, and in
+SBML format (taken from BioModels database, also called BMDB or BioModels.org).
Your task is to provide human-readable, accurate, detailed, and contextually appropriate responses based on the tools available.
-## Core Guidelines
-
### General Guidelines
* Stick strictly to the user's query.
* Do not make assumptions or inferences about missing or incomplete information in the user's input.
@@ -11,17 +10,19 @@
* You can call tools multiple times if needed to gather sufficient data or refine your answer.
* If asked about irrelevant topics, politely decline to answer.
-### Formatting Guidelines
-* When using mathematical expressions, wrap them properly: use `$expression$` for inline math (e.g., $k_{on}$, $\text{mmol}\cdot\text{ml}^{-1}$) and `$$expression$$` for display math blocks. Always use `\text{}` for text within math mode (e.g., $\text{Sos (Inactive)}$, $\text{concentration}$).
-* Format all units, chemical names, reaction rates, and numerical expressions using math mode to ensure proper rendering. Example: "The rate is $5.2 \times 10^{-3} \text{ mmol}\cdot\text{ml}^{-1}\cdot\text{min}^{-1}$".
-* If there is an opportunity for follow-up questions or further actions, always ask the user if they'd like to explore more options or if you can assist with other related tasks.
+### Formatting Guidelines for Mathematical Expressions
+* When using mathematical expressions, wrap them properly: use `$expression$` for inline math
+(e.g., $k_{on}$, $\text{mmol}\cdot\text{ml}^{-1}$) and `$$expression$$` for display math blocks. Always
+use `\text{}` for text within math mode (e.g., $\text{Sos (Inactive)}$, $\text{concentration}$).
+* Format all units, chemical names, reaction rates, and numerical expressions using math mode to ensure
+proper rendering. Example: "The rate is $5.2 \times 10^{-3} \text{ mmol}\cdot\text{ml}^{-1}\cdot\text{min}^{-1}$".
-### Biomodel Analysis Guidelines
-* Include as many relevant details as possible, such as biomodel ID, names, descriptions, parameters, and any other relevant metadata that can aid in the user's understanding.
-* When the user query is about: "Describe parameters", "Describe species", "Describe reactions", or "What Applications are used?" — specifically in the context of model analysis: Make sure to use the `get_vcml_file` tool to retrieve the VCML file for the biomodel. This file contains detailed information about the model's structure and behavior, which is essential for providing accurate descriptions of parameters, species, reactions, and applications. Use also the "fetch_biomodels" tool to gather additional context about the biomodel, and Try when asked these questions to focus on the asked aspects, Do not provide general summaries, model structure, or unrelated metadata unless explicitly requested. Keep the focus tightly on the requested element and be as technically precise as possible. Elaborate as much as you can on the requested aspect, providing detailed descriptions and explanations based on the VCML content.
+### Formatting Guidelines for Elements with Identifiers.org Links
+* Any model element that includes a link to identifiers.org MUST be formatted as an underlined clickable link.
+* ONLY identifiers.org links should be formatted this way.
+* Do not hyperlink any other model elements (including names, descriptions, or internal links like /search/...).
-### Publications Guidelines
-* If asked for publications, research papers, pubmed articles, etc. use the `fetch_publications` tool. After fetching, extract the relevant information, filter by user's specific needs, format publication links using markdown `[Title](DOI_URL)`, provide context (date, authors, description), and clearly communicate if no relevant publications are found.
-* When using the `fetch_publications` tool, the response contains the full list of VCell related publications with fields: `pubKey` (unique identifier), `title`, `authors` (array), `year`, `citation` (full citation string in journal format), `pubmedid` (PubMed ID), `doi` (DOI link to the publication), `biomodelReferences` (array of related biomodels), and `mathmodelReferences` (array of related mathematical models).
-* When presenting publications, always provide elaborate, fact-based responses based solely on the available tool results.
+### Guidelines for Follow-up Questions and Further Actions
+* If there is an opportunity for follow-up questions or further actions, always ask the user if they'd like to explore
+more options or if you can assist with other related tasks.
"""
diff --git a/backend/app/utils/tools_utils.py b/backend/app/utils/tools_utils.py
index fe17151..af7ee9f 100644
--- a/backend/app/utils/tools_utils.py
+++ b/backend/app/utils/tools_utils.py
@@ -1,9 +1,11 @@
from typing import List
-from app.services.vcelldb_service import (
+from app.services.databases_service import (
fetch_biomodels,
fetch_simulation_details,
get_vcml_file,
fetch_publications,
+ fetch_bmdb_models,
+ get_xml_file
)
from app.services.knowledge_base_service import get_similar_chunks
from app.schemas.vcelldb_schema import BiomodelRequestParams, SimulationRequestParams
@@ -14,9 +16,15 @@
ParameterSchema,
)
from app.core.logger import get_logger
+import re
logger = get_logger("tools_utils")
+# Bounds and default for the `maxRows` tool argument (max results per page).
+min_rows = 1
+max_rows = 50
+default_rows = 25
+
# Function calling Definitions using Pydantic schema objects
fetch_biomodels_tool = ToolDefinition(
type="function",
@@ -65,7 +73,9 @@
},
"maxRows": {
"type": "integer",
- "default": 1000,
+ "default": default_rows,
+ "minimum": min_rows,
+ "maximum": max_rows,
"description": "The maximum number of results to return per page.",
},
"orderBy": {
@@ -178,6 +188,52 @@
),
)
+
+fetch_bmdb_tool = ToolDefinition(  # schema the LLM sees for the "fetch_bmdb_models" tool
+ type="function",
+ function=FunctionDefinition(
+ name="fetch_bmdb_models",
+ description="Retrieves a list of biomodels from the BioModels database based on filtering criteria which is the biomodel name. This allows to search for specific biomodels in the BioModels database based on their attributes and retrieve the results.",
+ parameters=ParameterSchema(
+ type="object",
+ properties={
+ "bmId": {
+ "type": "string",
+ "default": "",
+ "description": "The unique identifier of the biomodel. This can be used to retrieve specific biomodels directly by their ID. It is under the format BIOMD followed by 10 numbers or MODEL followed by 10 numbers.",
+ },
+ "bmName": {
+ "type": "string",
+ "default": "",
+ "description": "The name or part of the name of the biomodel you are searching for. This can be used to find biomodels that match the provided name or keyword.",
+ },},
+ required=["bmId", "bmName"],  # both are listed as required although each declares a "" default
+ additionalProperties=False,
+ ),
+ strict=True,
+ ),
+)
+
+get_xml_file_tool = ToolDefinition(  # schema the LLM sees for the BioModels SBML XML download
+    type="function",
+    function=FunctionDefinition(
+        name="get_xml_file",
+        description="Retrieves the SBML XML (eXtensible Markup Language) file content for a specified BioModels model (BIOMD ID). SBML (Systems Biology Markup Language) files provide a detailed, machine-readable representation of a biomodel's structure and behavior, which is used for simulation and model analysis. This function downloads the XML representation of a biomodel for further analysis.",
+        parameters=ParameterSchema(
+            type="object",
+            properties={
+                "bmId": {
+                    "type": "string",
+                    "description": "ID of the BioModels biomodel whose SBML XML file should be retrieved",
+                }
+            },
+            required=["bmId"],
+            additionalProperties=False,
+        ),
+        strict=True,
+    ),
+)
+
# List of all tool definitions
ToolsDefinitions = [
fetch_biomodels_tool,
@@ -186,6 +242,91 @@
search_vcell_knowledge_base_tool,
fetch_publications_tool,
]
+BMDB_TOOLS = [fetch_bmdb_tool,  # BioModels database (BMDB) tools, kept apart from the subsets below
+ get_xml_file_tool]
+
+
+# Tool subsets grouped by intent; select_tools_for_prompt picks among DB/KB/PUB.
+DB_TOOLS = [  # biomodel, simulation and VCML lookups
+ fetch_biomodels_tool,
+ fetch_simulation_details_tool,
+ get_vcml_file_tool,
+]
+KB_TOOLS = [  # knowledge-base search; also the fallback subset
+ search_vcell_knowledge_base_tool,
+]
+PUB_TOOLS = [  # publication listing
+ fetch_publications_tool,
+]
+
+# decide which subset (if any) of tools to send to the llm
+# returning false skips tools and directly calls llm
+# returning true allows the llm to use tools
+def should_use_tools(prompt: str) -> bool:
+    """Return True when *prompt* shows a retrieval signal, False for plain chat."""
+    if not prompt:
+        return False
+
+    p = prompt.lower().strip()
+
+    # common prefixes where tools are unnecessary
+    plain_chat_prefixes = (
+        "summarize this",
+        "improve this",
+        "make this clearer",
+    )
+    if p.startswith(plain_chat_prefixes):
+        return False
+
+    # Patterns that suggest a database lookup or structured retrieval. They run
+    # against the lower-cased prompt, so every literal here must be lower case.
+    tool_signals = [
+        r"\b(list|show|find|get|fetch|search)\b",
+        r"\bmodel\b|\bmodels\b|\bbiomodel\b|\bbiomodels\b",
+        r"\bsimulation\b|\bsimulations\b",
+        r"\bvcml\b|\bxml\b",
+        r"\bpublication\b|\bpublications\b|\bpaper\b|\bpapers\b|\bpubmed\b",
+        r"\btutorial\b|\beducational\b|\bknowledge base\b",
+        r"\bhow do i\b|\bhow to\b|\bwhat is vcell\b",
+        r"\bbm\d+\b|\bbiomd\d+\b",
+    ]
+
+    return any(re.search(pattern, p) for pattern in tool_signals)
+
+# select only a subset of tools based on the user prompt
+def select_tools_for_prompt(prompt: str):
+    """Return the tool definitions whose intent keywords occur in *prompt*.
+
+    Falls back to the knowledge-base tools when no keyword group matches.
+    """
+    text = (prompt or "").lower()
+
+    # (keyword pattern, tool bucket) pairs, evaluated in this order:
+    # database/data-fetch, publications, knowledge/tutorial/how-to.
+    intent_buckets = (
+        (r"\b(model|models|biomodel|biomodels|simulation|simulations|vcml|bm\d+)\b", DB_TOOLS),
+        (r"\b(publication|publications|paper|papers|pubmed)\b", PUB_TOOLS),
+        (r"\b(tutorial|educational|knowledge base|how do i|how to|what is vcell|explain)\b", KB_TOOLS),
+    )
+    candidates = [
+        tool
+        for pattern, bucket in intent_buckets
+        if re.search(pattern, text)
+        for tool in bucket
+    ]
+
+    # Default fallback: no bucket matched, keep KB only.
+    if not candidates:
+        candidates = KB_TOOLS
+
+    # De-duplicate by function name; dict insertion order keeps the first
+    # occurrence of each tool and the original ordering.
+    unique_by_name = {}
+    for tool in candidates:
+        unique_by_name.setdefault(tool.function.name, tool)
+
+    return list(unique_by_name.values())
+
# Tool Executor Function
@@ -206,8 +347,9 @@ async def execute_tool(name, args):
# args["savedLow"] = None
# if args.get("savedHigh") == "":
# args["savedHigh"] = None
- args["maxRows"] = 1000
+ args["maxRows"] = default_rows
params = BiomodelRequestParams(**args)
+ print("DEBUG About to call fetch_biomodels()")
return await fetch_biomodels(params)
elif name == "fetch_simulation_details":
@@ -221,10 +363,17 @@ async def execute_tool(name, args):
query = args["query"]
limit = args.get("limit", 5)
logger.info(f"Executing tool: {name} with query {query}")
+ print("DEBUG About to call search_vcell_knowledge_base")
return get_similar_chunks(query=query, limit=limit)
elif name == "fetch_publications":
return await fetch_publications()
+
+ elif name == "fetch_bmdb_models":
+ params = BiomodelRequestParams(**args)
+ return await fetch_bmdb_models(params)
+ elif name == "get_xml_file":
+ return await get_xml_file(args["bmId"])
else:
return {}
diff --git a/backend/app/utils/vcdb_system_prompt.py b/backend/app/utils/vcdb_system_prompt.py
new file mode 100644
index 0000000..c645c30
--- /dev/null
+++ b/backend/app/utils/vcdb_system_prompt.py
@@ -0,0 +1,38 @@
+VCDB_SYSTEM_PROMPT = """
+### Publications Guidelines
+* If asked for publications, research papers, pubmed articles, etc. use the `fetch_publications` tool. After fetching, extract the relevant information, filter by user's specific needs, format publication links using markdown `[Title](DOI_URL)`, provide context (date, authors, description), and clearly communicate if no relevant publications are found.
+* When using the `fetch_publications` tool, the response contains the full list of VCell related publications with fields: `pubKey` (unique identifier), `title`, `authors` (array), `year`, `citation` (full citation string in journal format), `pubmedid` (PubMed ID), `doi` (DOI link to the publication), `biomodelReferences` (array of related biomodels), and `mathmodelReferences` (array of related mathematical models).
+* When presenting publications, always provide elaborate, fact-based responses based solely on the available tool results.
+
+
+## Formatting Guidelines for Biomodels
+You MUST follow this exact output format. Do NOT modify, omit, or reorder any fields.
+ALWAYS use the provided name and biomodelID exactly. Format the name as [name](/search/biomodelID).
+
+### Formatting Guidelines for biomodels retrieved from VCell database (VCDB)
+* For each VCELL model:
+```
+1. **[Biomodel Name](/search/${biomodelID})**
+ - **Biomodel Key:** ${biomodelId}
+ - **Owner:** ${owner}
+ - **Description:** ${description or summary of the biomodel, do not include `clonedFrom` info}
+ - **Applications:**
+
+List every application name for the model in italics, each on its own bullet point. Under each
+bulleted application name, list its corresponding simulations, with each simulation followed by a solver in round brackets.
+Do not omit any applications.
+```
+
+### Rules for LONG LISTS (>10 models)
+
+- ALWAYS continue numbering sequentially (1, 2, 3, ...)
+- Repeat the EXACT same structure for EVERY item
+- If applications exist, do NOT omit them
+- Do NOT summarize or shorten later items
+- Do NOT merge multiple models into one entry
+- Maintain identical formatting across all entries
+
+### Biomodel Analysis Guidelines
+* Include as many relevant details as possible, such as biomodel ID, names, descriptions, parameters, and any other relevant metadata that can aid in the user's understanding.
+* When the user query is about: "Describe parameters", "Describe species", "Describe reactions", or "What Applications are used?" — specifically in the context of model analysis: Make sure to use the `get_vcml_file` tool to retrieve the VCML file for the VCELL biomodel. This file contains detailed information about the model's structure and behavior, which is essential for providing accurate descriptions of parameters, species, reactions, and applications. Also use the `fetch_biomodels` tool to gather additional context about the biomodel. When asked these questions, focus on the requested aspects: do not provide general summaries, model structure, or unrelated metadata unless explicitly requested. Keep the focus tightly on the requested element and be as technically precise as possible. Elaborate as much as you can on the requested aspect, providing detailed descriptions and explanations based on the VCML content.
+"""
diff --git a/docker-compose.yml b/docker-compose.yml
index 958ec35..bb644a8 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -34,6 +34,9 @@ services:
container_name: frontend-vcell
ports:
- "3000:3000"
+ environment:
+ NEXT_PUBLIC_API_URL: http://localhost:8000
+ NEXT_PUBLIC_API_URL_BMDB: https://www.biomodels.org/
depends_on:
- backend
env_file:
diff --git a/frontend/app/admin/settings/page.tsx b/frontend/app/admin/settings/page.tsx
index 28fa6c7..f2ea618 100644
--- a/frontend/app/admin/settings/page.tsx
+++ b/frontend/app/admin/settings/page.tsx
@@ -277,7 +277,7 @@ export default function AdminSettingsPage() {
Follow the steps below to get started with your local deployment.
- For more details, check https://github.com/KacemMathlouthi/VCell-GSoC + For more details, check https://github.com/KacemMathlouthi/VCell-GSoC
@@ -287,10 +287,10 @@ export default function AdminSettingsPage() {
- git clone https://github.com/KacemMathlouthi/VCell-GSoC.git
+ git clone https://github.com/virtualcell/VCell-AI.git