From 2ac57507aad91ead3c7039511b61a9934707a95e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joan=20H=C3=A9risson?= Date: Thu, 21 May 2026 14:31:45 +0200 Subject: [PATCH 1/3] fix(rr_cache/rr_cache.py): add path to input filename --- rr_cache/rr_cache.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rr_cache/rr_cache.py b/rr_cache/rr_cache.py index 086c513..87a6a79 100644 --- a/rr_cache/rr_cache.py +++ b/rr_cache/rr_cache.py @@ -797,7 +797,7 @@ def _gen_reactions( else: reactions = getattr( rrCache, "_m_" + attribute + "_reactions_legacy" - )(dep_file, logger=logger) + )(_dep_file, logger=logger) logger.debug(" Writing data to file...") rrCache._store_cache_to_file( reactions, outfile, logger=logger @@ -1458,6 +1458,7 @@ def _m_rr_reactions_legacy( "rel_direction": int(row["Rule_relative_direction"]), "left": {row["Substrate_ID"]: 1}, "right": products, + "ec_numbers": row["EC_number"].split(",") if row["EC_number"] else [], } except ValueError: @@ -1469,6 +1470,8 @@ def _m_rr_reactions_legacy( "Problem converting rule_score: " + str(row["Score_normalized"]) ) + print(rr_reactions) + exit() return rr_reactions ## Generate complete reactions from the rxn_recipes.tsv from RetroRules From fe1586108b568520c0b6621c370479f7b2b0c7c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joan=20H=C3=A9risson?= Date: Thu, 21 May 2026 14:32:47 +0200 Subject: [PATCH 2/3] feat: add missing reactions in rules from mnx --- rr_cache/config/config_rr2026.json | 3 +- rr_cache/rr_cache.py | 309 ++++++++++++++++++++++++++++- tests/data/metrics_rr2026.json | 2 +- tests/test_rrCache.py | 18 ++ 4 files changed, 326 insertions(+), 6 deletions(-) diff --git a/rr_cache/config/config_rr2026.json b/rr_cache/config/config_rr2026.json index db5b10a..16de2f7 100644 --- a/rr_cache/config/config_rr2026.json +++ b/rr_cache/config/config_rr2026.json @@ -44,7 +44,8 @@ "url": "https://www.metanetx.org/ftp/4.5/", "files": { "chem_xref.tsv": "2b88dfe3b990ee85c8d8eb51f6d7079bf042b14e4cd11a4cc6ceb841686f9ea7ef4a04358363561fbbb2187a9870a1930037a030283908788a572e7b96b91ec7", - "chem_prop.tsv": "00d8d09d52884961748d032288781c4d47929f12d5052b367116c411194fdfae879b15844e8a15bccd494c26702dae57fd682c3bc03b5fee3c3e92e138c1eac2" + "chem_prop.tsv": "00d8d09d52884961748d032288781c4d47929f12d5052b367116c411194fdfae879b15844e8a15bccd494c26702dae57fd682c3bc03b5fee3c3e92e138c1eac2", + "reac_prop.tsv": "1796b5d77c707d0cfbfce99cf0466ef6aff0fe61ea5ded043216aa334e5ecd1388364d214162342f470b51eca2dab4309dcec2b9e074abee47dcae974c674bd7" } }, "rr2": { diff --git a/rr_cache/rr_cache.py b/rr_cache/rr_cache.py index 87a6a79..d76b9cc 100644 --- a/rr_cache/rr_cache.py +++ b/rr_cache/rr_cache.py @@ -12,6 +12,7 @@ from re import findall as re_findall # from time import time as time_time +import requests from requests import exceptions as r_exceptions from hashlib import sha512 from pathlib import Path @@ -296,15 +297,316 @@ def get_list_of_compounds(self): def get_reaction(self, rxn_id: str): return self.__get_object("template_reactions", rxn_id) + def add_reaction(self, rxn, persist: bool = True): + """Add a reaction to `template_reactions` and optionally persist the cache. + + The `rxn` argument may be either: + - a dict-like object containing at least `left` and `right` mappings (and + optional `direction`, `main_left`, `main_right`), or + - a string reaction id (e.g. 'MNXR12345') — in which case this will only + work if a local `reac_prop.tsv` exists in the input-cache (no download). + + This function no longer attempts to download `reac_prop.tsv`; download + and scanning should be handled by `add_missing_reactions`. + + Args: + rxn: Reaction data (dict-like or string id) + persist (bool): Whether to persist changes to disk (default: True). Set to False + to keep changes in memory only, useful when adding multiple reactions + to avoid repeated file write operations. + + Returns the reaction dict on success, or None on failure. + """ + # Ensure template_reactions are loaded + try: + if not self.__hasattr("template_reactions"): + self.Load(attrs=["template_reactions"]) + tr = self.get("template_reactions") or {} + except Exception as e: + self.logger.error(f"Failed to load template_reactions: {e}") + return None + + # If passed a dict-like reaction, insert directly + if isinstance(rxn, dict): + # try to detect rxn_id inside dict + rxn_id = rxn.get("id") or rxn.get("rxn_id") + if not rxn_id: + self.logger.error("Reaction dict must include an 'id' or 'rxn_id' key") + return None + if rxn_id in tr: + return tr[rxn_id] + + # Minimal validation: require left/right + if "left" not in rxn or "right" not in rxn: + self.logger.error( + "Reaction dict must contain 'left' and 'right' mappings" + ) + return None + + tr[rxn_id] = { + "left": rxn.get("left", {}), + "right": rxn.get("right", {}), + "direction": rxn.get("direction", 0), + "main_left": rxn.get("main_left", []), + "main_right": rxn.get("main_right", []), + } + + # persist to in-memory cache + try: + self.set("template_reactions", tr) + # persist to disk if requested + if persist: + outfile = os_path.join( + self.__cache_dir, + rrCache.__cache["template_reactions"]["file"]["name"], + ) + rrCache._store_cache_to_file(tr, outfile, logger=self.logger) + except Exception as e: + self.logger.warning(f"Failed to persist template_reactions cache: {e}") + return tr[rxn_id] + + # If passed a string id, try local lookup only (no download) + if isinstance(rxn, str): + rxn_id = rxn + if rxn_id in tr: + return tr[rxn_id] + + reac_prop_file = os_path.join(self.__input__cache_dir, "reac_prop.tsv") + if not os_path.exists(reac_prop_file): + self.logger.error( + "Local reac_prop.tsv not found — cannot add reaction by id. " + "Use add_missing_reactions to download/scan and add reactions." + ) + return None + + reaction = rrCache._m_mnx_reaction_from_reac_prop( + reac_prop_file, rxn_id, logger=self.logger + ) + if reaction is None: + self.logger.warning( + f"Reaction {rxn_id} not found in local reac_prop.tsv" + ) + return None + + # reuse dict insertion path + return self.add_reaction({"id": rxn_id, **reaction}, persist=persist) + + self.logger.error("add_reaction expects a dict-like or string reaction id") + return None + + @staticmethod + def _m_mnx_reaction_from_reac_prop( + reac_prop_path: str, + rxn_id: str, + logger: Logger = getLogger(__name__), + ) -> Dict: + if not os_path.exists(reac_prop_path): + logger.error(f"MetaNetX reaction file not found: {reac_prop_path}") + return None + + with open(reac_prop_path, "rt", encoding="utf-8-sig") as f: + reader = csv_reader(f, delimiter="\t") + header = None + for row in reader: + if not row or len(row) == 0: + continue + if row[0].startswith("#ID"): + header = [h.lstrip("#") for h in row] + continue + if row[0].startswith("#"): + continue + if header is None: + continue + + row_dict = { + header[i]: row[i] if i < len(row) else "" + for i in range(len(header)) + } + if row_dict.get("ID") != rxn_id: + continue + + equation = row_dict.get("mnx_equation") or row_dict.get("equation") + if not equation: + logger.warning( + f"No equation found for reaction {rxn_id} in reac_prop.tsv" + ) + return None + + parsed = rrCache._read_equation(equation, rxn_id, logger) + if parsed is None: + return None + + left = parsed.get("left", {}) + right = parsed.get("right", {}) + main_left = [next(iter(left.keys()))] if left else [] + main_right = [next(iter(right.keys()))] if right else [] + + return { + "left": left, + "right": right, + "direction": 0, + "main_left": main_left, + "main_right": main_right, + } + + return None + def get_list_of_reactions(self): return self.__get_list_of_objects("template_reactions") + def add_missing_reactions(self, limit: int = None, logger: Logger = None) -> int: + """Scan `reac_prop.tsv` (download if missing) for reactions absent from + `template_reactions`, build reaction dicts, and add them via + `add_reaction` one by one. + + Args: + limit (int, optional): Maximum number of reactions to add. None means no limit. + logger (Logger, optional): Logger to use. Defaults to self.logger. + + Returns: + int: Number of reactions successfully added. + """ + if logger is None: + logger = self.logger + + # Ensure template_reactions loaded + try: + if not self.__hasattr("template_reactions"): + self.Load(attrs=["template_reactions"]) + tr = self.get("template_reactions") or {} + except Exception as e: + logger.error(f"Failed to load template_reactions: {e}") + return 0 + + # Ensure reac_prop.tsv exists (try configured download, then fallback) + reac_prop_file = os_path.join(self.__input__cache_dir, "reac_prop.tsv") + if not os_path.exists(reac_prop_file): + try: + mnx_source = rrCache.__cache_sources.get("mnx", {}) + reac_fingerprint = mnx_source.get("files", {}).get("reac_prop.tsv") + rrCache._download_if_not_exists_or_corrupted( + mnx_source.get("url", ""), + "reac_prop.tsv", + self.__input__cache_dir, + reac_fingerprint, + logger=logger, + ) + except Exception: + # fallback direct download using config URL + try: + mnx_source = rrCache.__cache_sources.get("mnx", {}) + # base_url = mnx_source.get("url", "https://www.metanetx.org/ftp/4.5/") + base_url = mnx_source.get("url") + fallback_url = f"{base_url}reac_prop.tsv" + logger.debug( + f"Downloading fallback reac_prop.tsv from {fallback_url}" + ) + r = requests.get(fallback_url, timeout=30) + r.raise_for_status() + if not os_path.isdir(self.__input__cache_dir): + makedirs(self.__input__cache_dir, exist_ok=True) + with open(reac_prop_file, "w", encoding="utf-8") as f: + f.write(r.text) + except Exception as e2: + logger.error(f"Cannot retrieve MetaNetX reac_prop.tsv: {e2}") + return 0 + + to_add = [] + try: + with open(reac_prop_file, "rt", encoding="utf-8-sig") as f: + reader = csv_reader(f, delimiter="\t") + header = None + for row in reader: + if not row or len(row) == 0: + continue + if row[0].startswith("#ID"): + header = [h.lstrip("#") for h in row] + continue + if row[0].startswith("#"): + continue + if header is None: + continue + + rxn_id = row[0] + if rxn_id in tr: + continue + + # Build row dict and parse equation + row_dict = { + header[i]: row[i] if i < len(row) else "" + for i in range(len(header)) + } + equation = row_dict.get("mnx_equation") or row_dict.get("equation") + if not equation: + logger.debug(f"Skipping {rxn_id}: no equation") + continue + + parsed = rrCache._read_equation(equation, rxn_id, logger) + if parsed is None: + logger.debug(f"Skipping {rxn_id}: failed to parse equation") + continue + + left = parsed.get("left", {}) + right = parsed.get("right", {}) + main_left = [next(iter(left.keys()))] if left else [] + main_right = [next(iter(right.keys()))] if right else [] + + to_add.append( + { + "id": rxn_id, + "left": left, + "right": right, + "direction": 0, + "main_left": main_left, + "main_right": main_right, + } + ) + if limit is not None and len(to_add) >= limit: + break + except Exception as e: + logger.error(f"Error while scanning reac_prop.tsv: {e}") + return 0 + + added = 0 + for rxn_dict in to_add: + res = self.add_reaction(rxn_dict, persist=False) + if res is not None: + added += 1 + + # Store cache to file once after all reactions are added + if added > 0: + try: + outfile = os_path.join( + self.__cache_dir, + rrCache.__cache["template_reactions"]["file"]["name"], + ) + tr = self.get("template_reactions") or {} + rrCache._store_cache_to_file(tr, outfile, logger=logger) + except Exception as e: + logger.warning(f"Failed to persist template_reactions cache: {e}") + + return added + def get_reaction_rule(self, rr_id: str): return self.__get_object("rr_reactions", rr_id) def get_list_of_reaction_rules(self): return self.__get_list_of_objects("rr_reactions") + def __contains__(self, id: str) -> bool: + if not isinstance(id, str): + return False + + for attr in ("cid_strc", "template_reactions", "rr_reactions"): + try: + if not self.__hasattr(attr): + self.Load(attrs=[attr]) + if id in self.get(attr): + return True + except Exception: + continue + return False + def __get_object(self, attr: str, id: str): try: if not self.__hasattr(attr): @@ -503,6 +805,8 @@ def Build(self, interactive: bool = DEFAULTS["interactive"]) -> None: logger=self.logger, ) # , deprecatedCID_cid, deprecatedRID_rid, logger) print_progress(self.logger) + # Add missing reactions from MetaNetX + self.add_missing_reactions(logger=self.logger) try: rrCache._gen_comp_xref_deprecatedCompID_compid( self.__input__cache_dir, self.__cache_dir, self.logger @@ -797,7 +1101,7 @@ def _gen_reactions( else: reactions = getattr( rrCache, "_m_" + attribute + "_reactions_legacy" - )(_dep_file, logger=logger) + )(dep_file, logger=logger) logger.debug(" Writing data to file...") rrCache._store_cache_to_file( reactions, outfile, logger=logger @@ -1458,7 +1762,6 @@ def _m_rr_reactions_legacy( "rel_direction": int(row["Rule_relative_direction"]), "left": {row["Substrate_ID"]: 1}, "right": products, - "ec_numbers": row["EC_number"].split(",") if row["EC_number"] else [], } except ValueError: @@ -1470,8 +1773,6 @@ def _m_rr_reactions_legacy( "Problem converting rule_score: " + str(row["Score_normalized"]) ) - print(rr_reactions) - exit() return rr_reactions ## Generate complete reactions from the rxn_recipes.tsv from RetroRules diff --git a/tests/data/metrics_rr2026.json b/tests/data/metrics_rr2026.json index 4e32edf..00090e6 100644 --- a/tests/data/metrics_rr2026.json +++ b/tests/data/metrics_rr2026.json @@ -8,7 +8,7 @@ "file_size": 243759725 }, "template_reactions": { - "length": 42700, + "length": 95420, "file_size": 8519350 }, "rr_reactions": { diff --git a/tests/test_rrCache.py b/tests/test_rrCache.py index 12eabb1..2f9ca5f 100644 --- a/tests/test_rrCache.py +++ b/tests/test_rrCache.py @@ -159,6 +159,24 @@ def test_get_reaction_rule(caches, reference_data, cspace: str, rule_id: str): assert caches[cspace].get_reaction_rule(rule_id) == retrorules[rule_id] +@pytest.mark.parametrize("cspace, cmpd_id", COMPOUND_CASES) +def test_contains_compound_in_cache(caches, cspace: str, cmpd_id: str): + assert cmpd_id in caches[cspace] + assert "NOT_A_VALID_ID" not in caches[cspace] + + +@pytest.mark.parametrize("cspace, rxn_id", REACTION_CASES) +def test_contains_reaction_in_cache(caches, cspace: str, rxn_id: str): + assert rxn_id in caches[cspace] + assert "NOT_A_VALID_ID" not in caches[cspace] + + +@pytest.mark.parametrize("cspace, rule_id", RULE_CASES) +def test_contains_rule_in_cache(caches, cspace: str, rule_id: str): + assert rule_id in caches[cspace] + assert "NOT_A_VALID_ID" not in caches[cspace] + + @pytest.mark.parametrize("cspace, rule_id", RULE_CASES) def test_get_list_of_reaction_rules(caches, reference_data, cspace: str, rule_id: str): rule_ids = caches[cspace].get_list_of_reaction_rules() From ca6744bbe0335f93fade705f4643f8f293103699 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joan=20H=C3=A9risson?= Date: Fri, 22 May 2026 22:23:06 +0200 Subject: [PATCH 3/3] chore: insert reac_prop into config file --- rr_cache/config/config_rr2026.json | 2 +- rr_cache/rr_cache.py | 574 +++++++++++------------------ tests/data/metrics_rr2026.json | 2 +- 3 files changed, 210 insertions(+), 368 deletions(-) diff --git a/rr_cache/config/config_rr2026.json b/rr_cache/config/config_rr2026.json index 16de2f7..d43945d 100644 --- a/rr_cache/config/config_rr2026.json +++ b/rr_cache/config/config_rr2026.json @@ -31,7 +31,7 @@ }, "template_reactions": { "deps": { - "file_deps": ["templates_metadata"] + "file_deps": ["templates_metadata", "reac_prop.tsv"] }, "file": { "url": "", diff --git a/rr_cache/rr_cache.py b/rr_cache/rr_cache.py index d76b9cc..af2d5a2 100644 --- a/rr_cache/rr_cache.py +++ b/rr_cache/rr_cache.py @@ -7,12 +7,15 @@ MolToInchiKey, ) from csv import DictReader as csv_DictReader, reader as csv_reader +from pandas import ( + read_csv as pd_read_csv, + DataFrame, +) +from io import StringIO from json import dump as json_dump, dumps as json_dumps, load as json_load from gzip import open as gzip_open, GzipFile from re import findall as re_findall -# from time import time as time_time -import requests from requests import exceptions as r_exceptions from hashlib import sha512 from pathlib import Path @@ -297,103 +300,6 @@ def get_list_of_compounds(self): def get_reaction(self, rxn_id: str): return self.__get_object("template_reactions", rxn_id) - def add_reaction(self, rxn, persist: bool = True): - """Add a reaction to `template_reactions` and optionally persist the cache. - - The `rxn` argument may be either: - - a dict-like object containing at least `left` and `right` mappings (and - optional `direction`, `main_left`, `main_right`), or - - a string reaction id (e.g. 'MNXR12345') — in which case this will only - work if a local `reac_prop.tsv` exists in the input-cache (no download). - - This function no longer attempts to download `reac_prop.tsv`; download - and scanning should be handled by `add_missing_reactions`. - - Args: - rxn: Reaction data (dict-like or string id) - persist (bool): Whether to persist changes to disk (default: True). Set to False - to keep changes in memory only, useful when adding multiple reactions - to avoid repeated file write operations. - - Returns the reaction dict on success, or None on failure. - """ - # Ensure template_reactions are loaded - try: - if not self.__hasattr("template_reactions"): - self.Load(attrs=["template_reactions"]) - tr = self.get("template_reactions") or {} - except Exception as e: - self.logger.error(f"Failed to load template_reactions: {e}") - return None - - # If passed a dict-like reaction, insert directly - if isinstance(rxn, dict): - # try to detect rxn_id inside dict - rxn_id = rxn.get("id") or rxn.get("rxn_id") - if not rxn_id: - self.logger.error("Reaction dict must include an 'id' or 'rxn_id' key") - return None - if rxn_id in tr: - return tr[rxn_id] - - # Minimal validation: require left/right - if "left" not in rxn or "right" not in rxn: - self.logger.error( - "Reaction dict must contain 'left' and 'right' mappings" - ) - return None - - tr[rxn_id] = { - "left": rxn.get("left", {}), - "right": rxn.get("right", {}), - "direction": rxn.get("direction", 0), - "main_left": rxn.get("main_left", []), - "main_right": rxn.get("main_right", []), - } - - # persist to in-memory cache - try: - self.set("template_reactions", tr) - # persist to disk if requested - if persist: - outfile = os_path.join( - self.__cache_dir, - rrCache.__cache["template_reactions"]["file"]["name"], - ) - rrCache._store_cache_to_file(tr, outfile, logger=self.logger) - except Exception as e: - self.logger.warning(f"Failed to persist template_reactions cache: {e}") - return tr[rxn_id] - - # If passed a string id, try local lookup only (no download) - if isinstance(rxn, str): - rxn_id = rxn - if rxn_id in tr: - return tr[rxn_id] - - reac_prop_file = os_path.join(self.__input__cache_dir, "reac_prop.tsv") - if not os_path.exists(reac_prop_file): - self.logger.error( - "Local reac_prop.tsv not found — cannot add reaction by id. " - "Use add_missing_reactions to download/scan and add reactions." - ) - return None - - reaction = rrCache._m_mnx_reaction_from_reac_prop( - reac_prop_file, rxn_id, logger=self.logger - ) - if reaction is None: - self.logger.warning( - f"Reaction {rxn_id} not found in local reac_prop.tsv" - ) - return None - - # reuse dict insertion path - return self.add_reaction({"id": rxn_id, **reaction}, persist=persist) - - self.logger.error("add_reaction expects a dict-like or string reaction id") - return None - @staticmethod def _m_mnx_reaction_from_reac_prop( reac_prop_path: str, @@ -454,139 +360,6 @@ def _m_mnx_reaction_from_reac_prop( def get_list_of_reactions(self): return self.__get_list_of_objects("template_reactions") - def add_missing_reactions(self, limit: int = None, logger: Logger = None) -> int: - """Scan `reac_prop.tsv` (download if missing) for reactions absent from - `template_reactions`, build reaction dicts, and add them via - `add_reaction` one by one. - - Args: - limit (int, optional): Maximum number of reactions to add. None means no limit. - logger (Logger, optional): Logger to use. Defaults to self.logger. - - Returns: - int: Number of reactions successfully added. - """ - if logger is None: - logger = self.logger - - # Ensure template_reactions loaded - try: - if not self.__hasattr("template_reactions"): - self.Load(attrs=["template_reactions"]) - tr = self.get("template_reactions") or {} - except Exception as e: - logger.error(f"Failed to load template_reactions: {e}") - return 0 - - # Ensure reac_prop.tsv exists (try configured download, then fallback) - reac_prop_file = os_path.join(self.__input__cache_dir, "reac_prop.tsv") - if not os_path.exists(reac_prop_file): - try: - mnx_source = rrCache.__cache_sources.get("mnx", {}) - reac_fingerprint = mnx_source.get("files", {}).get("reac_prop.tsv") - rrCache._download_if_not_exists_or_corrupted( - mnx_source.get("url", ""), - "reac_prop.tsv", - self.__input__cache_dir, - reac_fingerprint, - logger=logger, - ) - except Exception: - # fallback direct download using config URL - try: - mnx_source = rrCache.__cache_sources.get("mnx", {}) - # base_url = mnx_source.get("url", "https://www.metanetx.org/ftp/4.5/") - base_url = mnx_source.get("url") - fallback_url = f"{base_url}reac_prop.tsv" - logger.debug( - f"Downloading fallback reac_prop.tsv from {fallback_url}" - ) - r = requests.get(fallback_url, timeout=30) - r.raise_for_status() - if not os_path.isdir(self.__input__cache_dir): - makedirs(self.__input__cache_dir, exist_ok=True) - with open(reac_prop_file, "w", encoding="utf-8") as f: - f.write(r.text) - except Exception as e2: - logger.error(f"Cannot retrieve MetaNetX reac_prop.tsv: {e2}") - return 0 - - to_add = [] - try: - with open(reac_prop_file, "rt", encoding="utf-8-sig") as f: - reader = csv_reader(f, delimiter="\t") - header = None - for row in reader: - if not row or len(row) == 0: - continue - if row[0].startswith("#ID"): - header = [h.lstrip("#") for h in row] - continue - if row[0].startswith("#"): - continue - if header is None: - continue - - rxn_id = row[0] - if rxn_id in tr: - continue - - # Build row dict and parse equation - row_dict = { - header[i]: row[i] if i < len(row) else "" - for i in range(len(header)) - } - equation = row_dict.get("mnx_equation") or row_dict.get("equation") - if not equation: - logger.debug(f"Skipping {rxn_id}: no equation") - continue - - parsed = rrCache._read_equation(equation, rxn_id, logger) - if parsed is None: - logger.debug(f"Skipping {rxn_id}: failed to parse equation") - continue - - left = parsed.get("left", {}) - right = parsed.get("right", {}) - main_left = [next(iter(left.keys()))] if left else [] - main_right = [next(iter(right.keys()))] if right else [] - - to_add.append( - { - "id": rxn_id, - "left": left, - "right": right, - "direction": 0, - "main_left": main_left, - "main_right": main_right, - } - ) - if limit is not None and len(to_add) >= limit: - break - except Exception as e: - logger.error(f"Error while scanning reac_prop.tsv: {e}") - return 0 - - added = 0 - for rxn_dict in to_add: - res = self.add_reaction(rxn_dict, persist=False) - if res is not None: - added += 1 - - # Store cache to file once after all reactions are added - if added > 0: - try: - outfile = os_path.join( - self.__cache_dir, - rrCache.__cache["template_reactions"]["file"]["name"], - ) - tr = self.get("template_reactions") or {} - rrCache._store_cache_to_file(tr, outfile, logger=logger) - except Exception as e: - logger.warning(f"Failed to persist template_reactions cache: {e}") - - return added - def get_reaction_rule(self, rr_id: str): return self.__get_object("rr_reactions", rr_id) @@ -805,8 +578,6 @@ def Build(self, interactive: bool = DEFAULTS["interactive"]) -> None: logger=self.logger, ) # , deprecatedCID_cid, deprecatedRID_rid, logger) print_progress(self.logger) - # Add missing reactions from MetaNetX - self.add_missing_reactions(logger=self.logger) try: rrCache._gen_comp_xref_deprecatedCompID_compid( self.__input__cache_dir, self.__cache_dir, self.logger @@ -1062,50 +833,48 @@ def _gen_reactions( if os_path.exists(outfile) and check_sha(outfile, rrCache.__cache[_attribute]): logger.debug(" Cache file already exists") else: + dep_files = {} # Iterate over the file dependencies and find the corresponding files in the input cache sources for dep_file in rrCache.__cache[_attribute]["deps"]["file_deps"]: # Iterate over sources to look if the dependency file is listed in for scat, source in rrCache.__cache_sources.items(): + if scat not in dep_files: + dep_files[scat] = [] if dep_file in source["files"]: # Look is the file is listed for one of the databases to use, # i.e. if the value is the fingerprint or a dict if isinstance(source["files"][dep_file], dict): - _reactions = {} for db in source["files"][dep_file]: - if db not in databases: + if db in databases: + dep_files[scat].append( + os_path.join(input_dir, db, dep_file) + ) + else: logger.debug( f"Database {db} is not in the list of databases to include in the cache, skipping generation of reactions for {db}" ) continue - logger.debug(" Generating data...") - _dep_file = os_path.join(input_dir, db, dep_file) - _reactions = getattr( - rrCache, "_m_" + attribute + "_reactions" - )(_dep_file, logger=logger) - # Merge with existing reactions for existing keys (append as nested dict) - # Otherwise, for rules built on reactions from several databases, - # the last database in the loop will overwrite the previous ones instead of merging them - for rkey, rval in _reactions.items(): - if ( - rkey in reactions - and isinstance(reactions[rkey], dict) - and isinstance(rval, dict) - ): - reactions[rkey].update(rval) - else: - reactions[rkey] = rval - logger.debug(" Writing data to file...") - rrCache._store_cache_to_file( - reactions, outfile, logger=logger - ) else: - reactions = getattr( - rrCache, "_m_" + attribute + "_reactions_legacy" - )(dep_file, logger=logger) - logger.debug(" Writing data to file...") - rrCache._store_cache_to_file( - reactions, outfile, logger=logger - ) + dep_files[scat].append(os_path.join(input_dir, dep_file)) + + method_name = "_m_" + attribute + "_reactions" + + if type == "legacy": + method_name += "_legacy" + # Get 'rr2' if not empty otherwise 'rr2more' + if dep_files["rr2"] != []: + dep_files = dep_files["rr2"][0] + elif dep_files["rr2more"] != []: + dep_files = dep_files["rr2more"][0] + else: + logger.error( + "No reaction rule file found for legacy type, cannot generate reactions" + ) + return + + reactions = getattr(rrCache, method_name)(dep_files, logger=logger) + logger.debug(" Writing data to file...") + rrCache._store_cache_to_file(reactions, outfile, logger=logger) del reactions @@ -1652,69 +1421,79 @@ def _m_mnxc_xref( @staticmethod def _m_rr_reactions( - rules_rall_path: str, logger: Logger = getLogger(__name__) + rules_rall_paths: str, logger: Logger = getLogger(__name__) ) -> Dict: - logger.debug(f"Parsing rules from {rules_rall_path}") + logger.debug(f"Parsing rules from {rules_rall_paths}") + + _rules_rall_paths = rules_rall_paths["rr2"] rr_reactions = {} - if not os_path.exists(rules_rall_path): - logger.error("Could not read the rules file (" + str(rules_rall_path) + ")") - return None + for _rules_rall_path in _rules_rall_paths: - for row in csv_DictReader(gzip_open(rules_rall_path, "rt"), delimiter="\t"): - if row["TEMPLATE_ID"] not in rr_reactions: - rr_reactions[row["TEMPLATE_ID"]] = {} - if row["REACTION_ID"] not in rr_reactions[row["TEMPLATE_ID"]]: - subtrates = {row["LEFT_IDS"]: 1} - products = dict(Counter(row["RIGHT_IDS"].split("."))) - rr_reactions[row["TEMPLATE_ID"]][row["REACTION_ID"]] = { - "rule_id": row["TEMPLATE_ID"], - "rule_score": None if row["SCORE"] == "" else float(row["SCORE"]), - "reac_id": row["REACTION_ID"], - "subs_id": row["LEFT_IDS"], - "rel_direction": (1 if row["DIRECTION"] == "L2R" else -1), - "left": subtrates, - "right": products, - "left_excluded": ( - row["LEFT_EXCLUDED_IDS"].split(".") - if row["LEFT_EXCLUDED_IDS"] - else [] - ), - "right_excluded": ( - row["RIGHT_EXCLUDED_IDS"].split(".") - if row["RIGHT_EXCLUDED_IDS"] - else [] - ), - } - # Handle multiple reactions per rule, update direction if needed - else: - if ( - rr_reactions[row["TEMPLATE_ID"]][row["REACTION_ID"]][ - "rel_direction" - ] - != 0 - and (1 if row["DIRECTION"] == "L2R" else -1) - != rr_reactions[row["TEMPLATE_ID"]][row["REACTION_ID"]][ - "rel_direction" - ] - ): - logger.debug( - "Updating direction for reaction " - + str(row["REACTION_ID"]) - + " in rule " - + str(row["TEMPLATE_ID"]) - + " from " - + str( - rr_reactions[row["TEMPLATE_ID"]][row["REACTION_ID"]][ - "rel_direction" - ] + if not os_path.exists(_rules_rall_path): + logger.error( + "Could not read the rules file (" + str(_rules_rall_path) + ")" + ) + return None + + for row in csv_DictReader( + gzip_open(_rules_rall_path, "rt"), delimiter="\t" + ): + if row["TEMPLATE_ID"] not in rr_reactions: + rr_reactions[row["TEMPLATE_ID"]] = {} + if row["REACTION_ID"] not in rr_reactions[row["TEMPLATE_ID"]]: + subtrates = {row["LEFT_IDS"]: 1} + products = dict(Counter(row["RIGHT_IDS"].split("."))) + rr_reactions[row["TEMPLATE_ID"]][row["REACTION_ID"]] = { + "rule_id": row["TEMPLATE_ID"], + "rule_score": ( + None if row["SCORE"] == "" else float(row["SCORE"]) + ), + "reac_id": row["REACTION_ID"], + "subs_id": row["LEFT_IDS"], + "rel_direction": (1 if row["DIRECTION"] == "L2R" else -1), + "left": subtrates, + "right": products, + "left_excluded": ( + row["LEFT_EXCLUDED_IDS"].split(".") + if row["LEFT_EXCLUDED_IDS"] + else [] + ), + "right_excluded": ( + row["RIGHT_EXCLUDED_IDS"].split(".") + if row["RIGHT_EXCLUDED_IDS"] + else [] + ), + } + # Handle multiple reactions per rule, update direction if needed + else: + if ( + rr_reactions[row["TEMPLATE_ID"]][row["REACTION_ID"]][ + "rel_direction" + ] + != 0 + and (1 if row["DIRECTION"] == "L2R" else -1) + != rr_reactions[row["TEMPLATE_ID"]][row["REACTION_ID"]][ + "rel_direction" + ] + ): + logger.debug( + "Updating direction for reaction " + + str(row["REACTION_ID"]) + + " in rule " + + str(row["TEMPLATE_ID"]) + + " from " + + str( + rr_reactions[row["TEMPLATE_ID"]][row["REACTION_ID"]][ + "rel_direction" + ] + ) + + " to bidirectional (0)" ) - + " to bidirectional (0)" - ) - rr_reactions[row["TEMPLATE_ID"]][row["REACTION_ID"]][ - "rel_direction" - ] = 0 # bidirectional + rr_reactions[row["TEMPLATE_ID"]][row["REACTION_ID"]][ + "rel_direction" + ] = 0 # bidirectional return rr_reactions @@ -1787,64 +1566,127 @@ def _m_rr_reactions_legacy( @staticmethod def _m_template_reactions( - metadata_path: str, logger: Logger = getLogger(__name__) + paths: Dict[str, List[str]], logger: Logger = getLogger(__name__) ) -> Dict: - if not os_path.exists(metadata_path): - logger.error("Cannot find file: " + str(metadata_path)) - return None + metadata_paths = paths["rr2"] + reac_prop_path = ( + paths["mnx"][0] if "mnx" in paths and len(paths["mnx"]) > 0 else None + ) + + logger.debug(f"metadata_paths: {metadata_paths}") + logger.debug(f"reac_prop_path: {reac_prop_path}") reactions = {} - for row in csv_DictReader(gzip_open(metadata_path, "rt"), delimiter="\t"): - if row["REACTION_ID"] not in reactions: - # print(row) - substrates = dict( - Counter( - [row["LEFT_IDS"]] - + ( - row["LEFT_EXCLUDED_IDS"].split(".") - if row["LEFT_EXCLUDED_IDS"] - else [] + # Extract reaction data from rules metadata files + for metadata_path in metadata_paths: + + if not os_path.exists(metadata_path): + logger.error("Cannot find file: " + str(metadata_path)) + return None + + for row in csv_DictReader(gzip_open(metadata_path, "rt"), delimiter="\t"): + if row["REACTION_ID"] not in reactions: + substrates = dict( + Counter( + [row["LEFT_IDS"]] + + ( + row["LEFT_EXCLUDED_IDS"].split(".") + if row["LEFT_EXCLUDED_IDS"] + else [] + ) ) ) - ) - products = dict( - Counter( - row["RIGHT_IDS"].split(".") - + ( - row["RIGHT_EXCLUDED_IDS"].split(".") - if row["RIGHT_EXCLUDED_IDS"] - else [] + products = dict( + Counter( + row["RIGHT_IDS"].split(".") + + ( + row["RIGHT_EXCLUDED_IDS"].split(".") + if row["RIGHT_EXCLUDED_IDS"] + else [] + ) ) ) - ) - # if row['REACTION_ID'] == 'MNXR182203': - # print(substrates) - # print(products) - # exit() - main_left = row["LEFT_IDS"] - main_right = row["RIGHT_IDS"].split(".")[0] - if row["DIRECTION"] == "R2L": - # Swap left and right if direction is R2L - substrates, products = products, substrates - main_left, main_right = main_right, main_left - direction = -1 + main_left = row["LEFT_IDS"] + main_right = row["RIGHT_IDS"].split(".")[0] + if row["DIRECTION"] == "R2L": + # Swap left and right if direction is R2L + substrates, products = products, substrates + main_left, main_right = main_right, main_left + direction = -1 + else: + direction = 1 + reactions[row["REACTION_ID"]] = { + "left": substrates, + "right": products, + "direction": direction, + "main_left": main_left, + "main_right": main_right, + } + # Handle multiple reactions per rule, update direction if needed + elif row["DIRECTION"] != reactions[row["REACTION_ID"]]["direction"]: + reactions[row["REACTION_ID"]]["direction"] = 0 # bidirectional + + # Complete missing reactions from the reaction properties file (TSV) + # Ignore all lines starting with '#', the last one contains the header '#ID' and 'mnx_equation' + # Example of mnx_equation: 1 MNXM10958@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MNXM1102128@MNXD1 + 1 MNXM8415@MNXD1 + if reac_prop_path: + reac_prop_df = rrCache.__load_reactions_tsv(reac_prop_path) + for _, row in reac_prop_df.iterrows(): + reac_id = row["ID"] + if reac_id not in reactions: + rxn = rrCache._read_equation(row["mnx_equation"], reac_id, logger) + # check 'right' and 'left' are not empty + if reac_id not in reactions and rxn["left"] and rxn["right"]: + reactions[reac_id] = { + "left": rxn["left"], + "right": rxn["right"], + "direction": 0, # default to bidirectional if not specified + "main_left": "", + "main_right": "", + } else: - direction = 1 - reactions[row["REACTION_ID"]] = { - "left": substrates, - "right": products, - "direction": direction, - "main_left": main_left, - "main_right": main_right, - } - # Handle multiple reactions per rule, update direction if needed - elif row["DIRECTION"] != reactions[row["REACTION_ID"]]["direction"]: - reactions[row["REACTION_ID"]]["direction"] = 0 # bidirectional + logger.debug( + f"Reaction {reac_id} already in reactions, skipping equation parsing" + ) return reactions + @staticmethod + def __load_reactions_tsv( + path: str, logger: Logger = getLogger(__name__) + ) -> "DataFrame": + """ + Load a TSV file while: + - ignoring comment lines starting with '#' + - using the LAST commented line as the header + """ + + header = None + data_lines = [] + + with open(path, "r") as f: + for line in f: + line = line.rstrip("\n") + + if line.startswith("#"): + # Save last commented line as header + header = line[1:].split("\t") + else: + data_lines.append(line) + + if header is None: + raise ValueError("No header line found starting with '#'") + + # Rebuild TSV content without comments + tsv_content = "\n".join(data_lines) + + # Read with pandas + df = pd_read_csv(StringIO(tsv_content), sep="\t", names=header) + + return df + @staticmethod def _m_template_reactions_legacy( rxn_recipes_path: str, logger: Logger = getLogger(__name__) diff --git a/tests/data/metrics_rr2026.json b/tests/data/metrics_rr2026.json index 00090e6..1378aee 100644 --- a/tests/data/metrics_rr2026.json +++ b/tests/data/metrics_rr2026.json @@ -8,7 +8,7 @@ "file_size": 243759725 }, "template_reactions": { - "length": 95420, + "length": 95419, "file_size": 8519350 }, "rr_reactions": {