-
Notifications
You must be signed in to change notification settings - Fork 0
Add closed shadow DOM capture via CDP #136
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
46547ec
b0a22e1
0bd2fc6
53a68c0
ec86c9d
bda958c
d1db8b4
a559f86
ddce23a
e6e2481
e7e8626
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -96,6 +96,146 @@ def fetch_percy_dom(): | |
| return response.text | ||
|
|
||
|
|
||
def _get_origin(url):
    """Return the normalized origin ``scheme://host[:port]`` of a URL.

    The result is normalized so that two URLs with the same origin always
    produce the same string:

    * userinfo (``user:pass@``) is dropped - it is not part of an origin;
    * the host is lowercased (``urlparse`` already lowercases the scheme);
    * the scheme's default port (80 for http/ws, 443 for https/wss) is
      omitted, so ``https://a.test:443`` equals ``https://a.test``;
    * IPv6 literals keep their square brackets.

    Args:
        url: Absolute URL string. May be ``None`` or empty.

    Returns:
        The origin string, or ``""`` when ``url`` is empty, has no scheme or
        host (e.g. ``about:blank``, relative paths), or cannot be parsed
        (e.g. a non-numeric port).
    """
    if not url:
        return ""
    default_ports = {"http": 80, "https": 443, "ws": 80, "wss": 443}
    try:
        parsed = urlparse(url)
        scheme = parsed.scheme
        # `.hostname` strips userinfo/port and lowercases; `.port` raises
        # ValueError for a non-numeric or out-of-range port.
        host = parsed.hostname
        port = parsed.port
        if not scheme or not host:
            return ""
        if ":" in host:
            # `.hostname` removes the brackets around IPv6 literals.
            host = f"[{host}]"
        if port is None or default_ports.get(scheme) == port:
            return f"{scheme}://{host}"
        return f"{scheme}://{host}:{port}"
    except Exception:
        # Best-effort helper: an unparseable URL simply has no origin.
        return ""
|
|
||
|
|
||
def _same_origin(url, page_origin):
    """Report whether `url` has the same origin as the page.

    Returns False whenever `page_origin` is empty/falsy, so an unknown page
    origin never matches anything; otherwise compares the origin computed by
    `_get_origin(url)` with `page_origin`.
    """
    return bool(page_origin) and _get_origin(url) == page_origin
|
|
||
|
|
||
def _walk_nodes(node, closed_pairs, page_origin=""):
    """Walk a CDP DOM tree and collect closed shadow roots.

    The traversal is an explicit-stack pre-order walk rather than recursion,
    so very deep DOM trees cannot exhaust Python's recursion limit (the
    resulting RecursionError would be swallowed by the caller and silently
    disable closed-shadow capture).

    Frame handling for a node carrying a `contentDocument`:

    * no `documentURL`          -> the node is skipped entirely;
    * origin != `page_origin`   -> the node is skipped entirely (logged);
    * same origin               -> the contentDocument is walked like any
                                   other subtree, followed by the node's own
                                   shadow roots and children.

    Args:
        node: Root CDP DOM node (dict) to start from.
        closed_pairs: List mutated in place; one
            `{"hostBackendNodeId", "shadowBackendNodeId"}` dict is appended
            per closed shadow root, in document (pre-order) order.
        page_origin: Origin of the top-level page, used to decide which frame
            documents are same-origin. Empty means no frame is same-origin.
    """
    # Each entry is (node, host_backend_id). host_backend_id is set only when
    # the node is a closed shadow root; its pair is emitted when the entry is
    # popped, which keeps the output order identical to a recursive walk.
    stack = [(node, None)]
    while stack:
        current, closed_host_id = stack.pop()
        if closed_host_id is not None:
            closed_pairs.append({
                "hostBackendNodeId": closed_host_id,
                "shadowBackendNodeId": current["backendNodeId"]
            })

        pending = []
        if "contentDocument" in current:
            content_doc = current["contentDocument"]
            document_url = content_doc.get("documentURL")
            if not document_url:
                # Unresolvable origin: skip this frame node defensively.
                continue
            if not _same_origin(document_url, page_origin):
                log(
                    "Skipping cross-origin frame document during"
                    f" closed-shadow walk: {document_url}",
                    lvl="debug"
                )
                continue
            pending.append((content_doc, None))

        for shadow_root in current.get("shadowRoots", ()):
            if shadow_root.get("shadowRootType") == "closed":
                pending.append((shadow_root, current["backendNodeId"]))
            else:
                # Open / user-agent roots are not recorded, but closed roots
                # may be nested inside them, so they are still walked.
                pending.append((shadow_root, None))

        pending.extend((child, None) for child in current.get("children", ()))

        # Reverse so the first pending entry is popped (visited) first.
        stack.extend(reversed(pending))
|
|
||
|
|
||
| # pylint: disable=too-many-locals | ||
| def expose_closed_shadow_roots(page): | ||
| """Use CDP to discover closed shadow roots and expose them to PercyDOM.serialize(). | ||
| Closed shadow roots are inaccessible from JS (element.shadowRoot === null), | ||
| but CDP's DOM domain can pierce them.""" | ||
| cdp_session = None | ||
| dom_enabled = False | ||
| try: | ||
| cdp_session = page.context.new_cdp_session(page) | ||
|
|
||
| cdp_session.send("DOM.enable") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
HIGH — selenium-dotnet #436 and selenium-ruby #32 both pair `DOM.enable` with a matching `DOM.disable`.
Suggested fix:
```python
dom_enabled = False
try:
    cdp_session.send("DOM.enable")
    dom_enabled = True
    # ... existing walk + resolve + register ...
finally:
    if dom_enabled:
        try:
            cdp_session.send("DOM.disable")
        except Exception:
            pass
    try:
        cdp_session.detach()
    except Exception:
        pass
```
Also add a test mirroring dotnet's |
||
| dom_enabled = True | ||
| doc_result = cdp_session.send( | ||
| "DOM.getDocument", {"depth": -1, "pierce": True} | ||
| ) | ||
| root = doc_result["root"] | ||
|
|
||
| # Compute the top-level page origin once so the walker can recurse | ||
| # into same-origin child frame documents but skip cross-origin ones. | ||
| page_origin = _get_origin(page.url) | ||
|
|
||
| closed_pairs = [] | ||
| _walk_nodes(root, closed_pairs, page_origin) | ||
|
|
||
| if not closed_pairs: | ||
| return | ||
|
|
||
| log( | ||
| f"Found {len(closed_pairs)} closed shadow root(s)," | ||
| " exposing via CDP", | ||
| lvl="debug" | ||
| ) | ||
|
|
||
| weakmap_script = ( | ||
| "() => { window.__percyClosedShadowRoots =" | ||
| " window.__percyClosedShadowRoots || new WeakMap(); }" | ||
| ) | ||
| page.evaluate(weakmap_script) | ||
|
|
||
| fn_decl = ( | ||
| "function(shadowRoot) {" | ||
| " window.__percyClosedShadowRoots" | ||
| ".set(this, shadowRoot); }" | ||
| ) | ||
| for pair in closed_pairs: | ||
| host_id = pair["hostBackendNodeId"] | ||
| host_result = cdp_session.send( | ||
| "DOM.resolveNode", {"backendNodeId": host_id} | ||
| ) | ||
| host_object_id = host_result["object"]["objectId"] | ||
|
|
||
| shadow_id = pair["shadowBackendNodeId"] | ||
| shadow_result = cdp_session.send( | ||
| "DOM.resolveNode", {"backendNodeId": shadow_id} | ||
| ) | ||
| shadow_object_id = shadow_result["object"]["objectId"] | ||
|
|
||
| cdp_session.send("Runtime.callFunctionOn", { | ||
| "functionDeclaration": fn_decl, | ||
| "objectId": host_object_id, | ||
| "arguments": [{"objectId": shadow_object_id}] | ||
| }) | ||
| except Exception as err: | ||
| log( | ||
| f"Could not expose closed shadow roots via CDP: {err}", | ||
| lvl="debug" | ||
| ) | ||
| finally: | ||
| # Release the DOM domain so subsequent CDP commands don't keep | ||
| # emitting DOM events for this session. Only sent when DOM.enable | ||
| # succeeded — a failing enable must not emit a spurious disable. | ||
| if dom_enabled: | ||
| try: | ||
| cdp_session.send("DOM.disable") | ||
| except Exception: # pragma: no cover | ||
| pass | ||
| if cdp_session: # pragma: no branch | ||
| try: | ||
| cdp_session.detach() | ||
| except Exception: # pragma: no cover | ||
| pass | ||
|
|
||
|
|
||
| def process_frame(page, frame, options, percy_dom_script): | ||
| """ | ||
| Processes a single cross-origin frame to capture its snapshot and resources. | ||
|
|
@@ -439,6 +579,9 @@ def capture_responsive_dom(page, cookies, percy_dom_script=None, config=None, ** | |
| if PERCY_RESPONSIVE_CAPTURE_RELOAD_PAGE: | ||
| page.reload() | ||
| page.evaluate(percy_dom_script) | ||
| # Re-prime the closed-shadow-root WeakMap — page.reload() creates a | ||
| # new document and erases window.__percyClosedShadowRoots. | ||
| expose_closed_shadow_roots(page) | ||
| page.evaluate("PercyDOM.waitForResize()") | ||
| resize_count = 0 | ||
|
|
||
|
|
@@ -492,6 +635,10 @@ def percy_snapshot(page, name, **kwargs): | |
| # Inject the DOM serialization script | ||
| percy_dom_script = fetch_percy_dom() | ||
| page.evaluate(percy_dom_script) | ||
|
|
||
| # Expose closed shadow roots via CDP before serialization | ||
| expose_closed_shadow_roots(page) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. MEDIUM — Responsive captures reload the page at each width and re-evaluate selenium-ruby #32 explicitly re-primes after refresh in its responsive snapshot loop ( Add a unit test that drives the responsive capture path through |
||
|
|
||
| cookies = page.context.cookies() | ||
|
|
||
| # Serialize and capture the DOM | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
HIGH — Walker skips ALL iframes; selenium-dotnet #436 had the same initial bug and was fixed on CE re-review.
Same-origin iframes share the parent JS realm and the
`window.__percyClosedShadowRoots` WeakMap that `PercyDOM.serialize` reads. Closed shadow roots inside same-origin iframes SHOULD be captured. Cross-origin iframes should still be skipped (different JS realm; `resolveNode` objectIds wouldn't belong to our execution context anyway).
selenium-dotnet's fix (commit
`fix(snapshot): recurse closed shadow walk into same-origin iframes`):
- Same-origin (`GetOrigin(documentURL) == pageOrigin`) → recurse INTO `contentDocument`.
- Cross-origin → skip.
- Missing `documentURL` → defensive skip.
Suggested Python port:
`_same_origin` should use `urllib.parse.urlparse` to compare scheme + host + port. Add tests for all three branches (same-origin captured, cross-origin skipped, missing URL skipped) — mirroring the dotnet test suite added in that PR.