"""find_careers_page(): orchestrate the Stage 2 tier cascade.

Cascade order (return early on first success):
  1.  ATS detection  → ats.detect_and_fetch()            confidence 0.95
  1b. ATS slug guess → ats.recover_via_slug_guess()      0.90
  2.  URL patterns   → heuristics.probe_url_patterns()   0.80
  3.  Homepage scan  → heuristics.scan_homepage_links()  0.60
  4.  Sitemap        → heuristics.parse_sitemap()        0.50
  5.  Cheap-LLM      → classify_llm   (stub, not implemented in this phase)
  6.  Browser agent  → agent_fallback (stub, not implemented in this phase)

A hit from tiers 2-4 is upgraded to an ATS result (confidence 0.95) when the
resolved URL, or the HTML of the page behind it, identifies an ATS board that
has a registered fetcher.

Returns a CareersResult with the URL, confidence, method string, and — when
an ATS tier resolves — the first open-position URL for free (Stage-3 shortcut).

The optional *client* parameter follows the managed-client pattern from
resolve.py: supply an existing httpx.Client to reuse connections; otherwise a
short-lived client is created and closed here.
"""

from __future__ import annotations

import logging

import httpx
from pydantic import BaseModel

from ..http import build_client, request_with_retries
from . import ats as _ats
from . import heuristics as _heuristics

logger = logging.getLogger(__name__)

# Confidence assigned to each resolving tier. Kept in one place because the
# ATS value is used both by Tier 1 and by the heuristic-hit upgrade path.
_CONFIDENCE_ATS = 0.95
_CONFIDENCE_SLUG_GUESS = 0.90
_CONFIDENCE_URL_PATTERN = 0.80
_CONFIDENCE_HOMEPAGE_SCAN = 0.60
_CONFIDENCE_SITEMAP = 0.50


# ---------------------------------------------------------------------------
# Result model
# ---------------------------------------------------------------------------


class CareersResult(BaseModel):
    """Typed output of find_careers_page()."""

    careers_url: str | None = None
    confidence: float = 0.0
    # method values: "ats:{name}" | "ats:{name}:slug_guess" | "url_pattern"
    #                | "homepage_scan" | "sitemap" | "none"
    method: str = "none"
    ats_name: str | None = None
    # Free Stage-3 shortcut: populated when an ATS tier resolves (first open job URL).
    position_url: str | None = None


# ---------------------------------------------------------------------------
# Public entry point
# ---------------------------------------------------------------------------


def find_careers_page(
    website: str,
    *,
    company_name: str | None = None,
    client: httpx.Client | None = None,
) -> CareersResult:
    """Run the careers-page discovery cascade for one company website.

    *company_name* is optional; when provided it supplies a second slug
    candidate for the Tier 1b slug-guess recovery and enables the org-name
    cross-check.

    *client* is an optional httpx.Client to reuse. When omitted, a client is
    built via build_client() and closed before returning.

    Returns a CareersResult; the default (method "none") when every tier
    misses. Exceptions raised inside a tier are logged and the cascade falls
    through to the next tier.
    """
    _managed = client is None
    if _managed:
        client = build_client()
    try:
        # Fetch the homepage once; shared by ATS detection and homepage-link scan.
        homepage_html: str | None = _safe_get_html(website, client)

        # ------------------------------------------------------------------
        # Tier 1 — ATS detection + public JSON API
        # ------------------------------------------------------------------
        try:
            ats_result = _ats.detect_and_fetch(
                website, client, homepage_html=homepage_html
            )
            if ats_result is not None:
                logger.info(
                    "cascade(%s): resolved via ats:%s careers_url=%s",
                    website,
                    ats_result.ats_name,
                    ats_result.careers_url,
                )
                return CareersResult(
                    careers_url=ats_result.careers_url,
                    confidence=_CONFIDENCE_ATS,
                    method=f"ats:{ats_result.ats_name}",
                    ats_name=ats_result.ats_name,
                    position_url=ats_result.position_url,
                )
        except Exception as exc:
            logger.warning("cascade(%s): ats tier error: %s", website, exc)

        # ------------------------------------------------------------------
        # Tier 1b — Slug-guess ATS recovery (JS-embedded / SPA boards)
        # ------------------------------------------------------------------
        try:
            rec = _ats.recover_via_slug_guess(website, company_name, client)
            if rec is not None:
                logger.info(
                    "cascade(%s): resolved via ats:%s:slug_guess careers_url=%s",
                    website,
                    rec.ats_name,
                    rec.careers_url,
                )
                return CareersResult(
                    careers_url=rec.careers_url,
                    confidence=_CONFIDENCE_SLUG_GUESS,
                    method=f"ats:{rec.ats_name}:slug_guess",
                    ats_name=rec.ats_name,
                    position_url=rec.position_url,
                )
        except Exception as exc:
            logger.warning("cascade(%s): slug_guess tier error: %s", website, exc)

        # ------------------------------------------------------------------
        # Tier 2 — URL-pattern probing
        # ------------------------------------------------------------------
        try:
            url = _heuristics.probe_url_patterns(website, client)
            if url:
                return _finalize(
                    url, "url_pattern", _CONFIDENCE_URL_PATTERN, website, client
                )
        except Exception as exc:
            logger.warning("cascade(%s): url_pattern tier error: %s", website, exc)

        # ------------------------------------------------------------------
        # Tier 3 — Homepage link scan (reuse fetched HTML)
        # ------------------------------------------------------------------
        try:
            url = _heuristics.scan_homepage_links(
                website, client, homepage_html=homepage_html
            )
            if url:
                return _finalize(
                    url, "homepage_scan", _CONFIDENCE_HOMEPAGE_SCAN, website, client
                )
        except Exception as exc:
            logger.warning("cascade(%s): homepage_scan tier error: %s", website, exc)

        # ------------------------------------------------------------------
        # Tier 4 — Sitemap
        # ------------------------------------------------------------------
        try:
            url = _heuristics.parse_sitemap(website, client)
            if url:
                return _finalize(url, "sitemap", _CONFIDENCE_SITEMAP, website, client)
        except Exception as exc:
            logger.warning("cascade(%s): sitemap tier error: %s", website, exc)

        # All deterministic tiers missed.
        logger.info("cascade(%s): all deterministic tiers missed", website)
        return CareersResult()
    finally:
        # Only close a client this function created; a caller-supplied client
        # stays open for reuse.
        if _managed:
            client.close()


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _detect_ats_in_page(url: str, client: httpx.Client) -> "_ats.ATSBoard | None":
    """Fetch a page and run ATS detection on its HTML; return ATSBoard or None.

    Best-effort: an HTTP status >= 400 or any exception yields None.
    """
    try:
        resp = request_with_retries(client, "GET", url, max_retries=0)
        if resp.status_code < 400:
            return _ats.detect_ats_in_html(resp.text)
    except Exception as exc:
        # Deliberately non-fatal, but leave a trace instead of failing silently.
        logger.debug("cascade: ats page probe %s error: %s", url, exc)
    return None


def _safe_get_html(website: str, client: httpx.Client) -> str | None:
    """Best-effort homepage fetch; return HTML text or None on any failure."""
    try:
        resp = request_with_retries(client, "GET", website, max_retries=1)
        if resp.status_code < 400:
            return resp.text
        logger.info(
            "cascade: homepage GET %s returned HTTP %s", website, resp.status_code
        )
        return None
    except Exception as exc:
        logger.info("cascade: homepage GET %s error: %s", website, exc)
        return None


def _finalize(
    url: str,
    method: str,
    confidence: float,
    website: str,
    client: httpx.Client,
) -> CareersResult:
    """Attempt ATS-URL upgrade for heuristic hits; return a CareersResult.

    If the URL resolved by a heuristic tier is itself an ATS board page
    (e.g. jobs.lever.co/acme), or the page behind it contains an ATS embed,
    fetch the board so we can return a position_url and upgrade the
    confidence to 0.95.

    Any failure during the upgrade attempt (detection or fetch) is logged and
    the plain heuristic result (*url*, *method*, *confidence*) is returned, so
    a valid heuristic hit is never lost to an upgrade error.
    """
    try:
        board = _ats.detect_ats_in_url(url)
        if board is None:
            # The URL itself isn't an ATS board link; fetch the page and check its HTML.
            # This catches companies like Vercel/Figma whose Greenhouse embed is only
            # on /careers.
            board = _detect_ats_in_page(url, client)

        if board is not None:
            fetch_fn = _ats._FETCH_DISPATCH.get(board.ats_name)
            # No registered fetcher for this ATS: keep the heuristic result.
            if fetch_fn is not None:
                fetch = fetch_fn(board, client)  # type: ignore[operator]
                upgraded_method = f"ats:{board.ats_name}"
                logger.info(
                    "cascade(%s): %s hit upgraded to %s careers_url=%s",
                    website,
                    method,
                    upgraded_method,
                    board.careers_url,
                )
                return CareersResult(
                    careers_url=board.careers_url,
                    confidence=_CONFIDENCE_ATS,
                    method=upgraded_method,
                    ats_name=board.ats_name,
                    position_url=fetch.first_url,
                )
    except Exception as exc:
        logger.warning(
            "cascade(%s): ats upgrade for %s failed: %s", website, method, exc
        )

    logger.info(
        "cascade(%s): resolved via %s careers_url=%s confidence=%.2f",
        website,
        method,
        url,
        confidence,
    )
    return CareersResult(
        careers_url=url,
        confidence=confidence,
        method=method,
    )