"""ATS detection and public JSON API fetching (Stage 2, tier 1). Detects Greenhouse / Lever / Ashby / Workday from a company homepage's HTML or from a URL string, then calls each platform's public (no-auth) JSON API to return both a canonical careers-page URL and the first open-position URL. Live-verified API shapes (2026-06-17): Greenhouse: GET https://boards-api.greenhouse.io/v1/boards/{slug}/jobs?per_page=1 → {"jobs":[{"absolute_url":"...","company_name":"..."},...], "meta":{"total":N}} Lever: GET https://api.lever.co/v0/postings/{slug}?mode=json&limit=1 → [{"hostedUrl":"..."},...] (JSON array, empty list if no slug) Ashby: GET https://api.ashbyhq.com/posting-api/job-board/{slug} → {"jobs":[{"jobUrl":"..."},...], "apiVersion":"..."} NOTE: GET only — POST is NOT used (CLAUDE.md gotcha was wrong) Workday: POST https://{host}/wday/cxs/{tenant}/{site}/jobs body {"appliedFacets":{},"limit":1,"offset":0,"searchText":""} → {"total":N,"jobPostings":[{"externalPath":"/job/..."},...]} Job URL: https://{host}/en-US/{site}{externalPath} """ from __future__ import annotations import logging import re from urllib.parse import urlsplit import httpx from pydantic import BaseModel from ..http import build_client, request_with_retries from ..resolve import _slug logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Detection patterns — keyed by ATS name, applied to raw HTML or URL strings. # Each pattern yields named groups used to build board coordinates. # --------------------------------------------------------------------------- # Greenhouse: match board embed script or iframe src or direct board links. 
# Captures: slug1 / slug2 / slug3 — one named group per alternative, because
# `re` does not allow a group name to be reused within a single pattern.
#   slug1: direct link on boards.greenhouse.io/{slug}
#   slug2: direct link on job-boards.greenhouse.io/{slug}
#   slug3: any *.greenhouse.io embed URL — iframe (embed/job_board?for=),
#          script (embed/job_board/js?for=) or application (embed/job_app?for=)
# The (?!embed/) lookaheads stop the literal path segment "embed" from being
# captured as a board slug; embed URLs fall through to the slug3 alternative.
_GH_PATTERN = re.compile(
    r"(?:"
    r"boards\.greenhouse\.io/(?!embed/)(?P<slug1>[A-Za-z0-9_-]+)"
    r"|job-boards\.greenhouse\.io/(?!embed/)(?P<slug2>[A-Za-z0-9_-]+)"
    r"|greenhouse\.io/embed/job_(?:board(?:/js)?|app)\?for=(?P<slug3>[A-Za-z0-9_-]+)"
    r")",
    re.IGNORECASE,
)

# Lever: board is always jobs.lever.co/{slug}
# Captures: slug
_LEVER_PATTERN = re.compile(
    r"jobs\.lever\.co/(?P<slug>[A-Za-z0-9_-]+)",
    re.IGNORECASE,
)

# Ashby: board is always jobs.ashbyhq.com/{slug}; slugs may be mixed-case
# Captures: slug
_ASHBY_PATTERN = re.compile(
    r"jobs\.ashbyhq\.com/(?P<slug>[A-Za-z0-9_%-]+)",
    re.IGNORECASE,
)

# Workday: tenant.wdN.myworkdayjobs.com[/locale]/site
# Captures: tenant, wdnum (the N in wdN), site
# Locale portion (e.g. "en-US/") is optional and consumed but not captured.
_WORKDAY_PATTERN = re.compile(
    r"(?P<tenant>[A-Za-z0-9_-]+)\.wd(?P<wdnum>\d+)\.myworkdayjobs\.com"
    r"(?:/[a-z]{2}-[A-Z]{2})?"  # optional locale segment
    r"/(?P<site>[A-Za-z0-9_%-]+)",
    re.IGNORECASE,
)

# Ordered list used by both detect_ats_in_html and detect_ats_in_url
_ATS_CHECKS: list[tuple[str, re.Pattern[str]]] = [
    ("greenhouse", _GH_PATTERN),
    ("lever", _LEVER_PATTERN),
    ("ashby", _ASHBY_PATTERN),
    ("workday", _WORKDAY_PATTERN),
]

# Canonical careers-board URL per slug-addressed platform. Shared by regex
# detection (_build_board) and slug-guess probing (_board_from_slug) so the
# two paths cannot drift apart. Workday is absent: it needs tenant + site.
_SLUG_CAREERS_URLS: dict[str, str] = {
    "greenhouse": "https://boards.greenhouse.io/{slug}",
    "lever": "https://jobs.lever.co/{slug}",
    "ashby": "https://jobs.ashbyhq.com/{slug}",
}


# ---------------------------------------------------------------------------
# Data models
# ---------------------------------------------------------------------------


class ATSBoard(BaseModel):
    """Coordinates for a detected ATS board (detection output)."""

    ats_name: str  # greenhouse | lever | ashby | workday
    slug: str  # board slug / company identifier
    careers_url: str  # canonical human-readable careers board URL
    wd_host: str | None = None  # workday only: full myworkdayjobs.com host
    wd_tenant: str | None = None  # workday only: tenant portion of host
    wd_site: str | None = None  # workday only: site path segment


class ATSFetch(BaseModel):
    """Per-platform API fetch result; single source of truth for response field shapes."""

    first_url: str | None = None  # first open-position URL from the board
    job_count: int = 0  # total jobs reported by the board
    org_name: str | None = None  # company_name from the API (Greenhouse only)


class ATSResult(BaseModel):
    """Output of detect_and_fetch / recover_via_slug_guess: careers URL + optional first-job URL."""

    ats_name: str
    careers_url: str
    position_url: str | None = None
    job_count: int = 0


# ---------------------------------------------------------------------------
# Detection
# ---------------------------------------------------------------------------


def _detect_ats(text: str) -> ATSBoard | None:
    """Return the first ATS board found in *text*, or None.

    Patterns are tried in _ATS_CHECKS order (Greenhouse → Lever → Ashby →
    Workday). Empty or None input yields None. No network calls.
    """
    if not text:
        return None
    for ats_name, pattern in _ATS_CHECKS:
        match = pattern.search(text)
        if match is None:
            continue
        board = _build_board(ats_name, match)
        if board is not None:
            return board
    return None


def detect_ats_in_html(html: str) -> ATSBoard | None:
    """Scan page HTML for known ATS signals; return the first match or None.

    Tries patterns in order: Greenhouse → Lever → Ashby → Workday.
    Pure function; no network calls.
    """
    return _detect_ats(html)


def detect_ats_in_url(url: str) -> ATSBoard | None:
    """Scan a single URL string for ATS board coordinates; return first match or None.

    Used by the cascade finalizer to recognise when a heuristic-found careers
    link is itself an ATS board (e.g. jobs.lever.co/acme).
    Pure function; no network calls.
    """
    return _detect_ats(url)


def _build_board(ats_name: str, m: re.Match[str]) -> ATSBoard | None:
    """Construct an ATSBoard from a regex match; return None if slug is empty.

    Args:
        ats_name: Platform key whose pattern produced *m*.
        m: Match from the corresponding entry in _ATS_CHECKS.

    Returns:
        The board coordinates, or None when a required capture is empty or
        *ats_name* is not a known platform.
    """
    if ats_name == "workday":
        tenant = (m.group("tenant") or "").strip()
        wdnum = (m.group("wdnum") or "").strip()
        site = (m.group("site") or "").strip()
        if not (tenant and wdnum and site):
            return None
        host = f"{tenant}.wd{wdnum}.myworkdayjobs.com"
        return ATSBoard(
            ats_name="workday",
            slug=f"{tenant}/{site}",
            careers_url=f"https://{host}/en-US/{site}",
            wd_host=host,
            wd_tenant=tenant,
            wd_site=site,
        )

    if ats_name == "greenhouse":
        # Exactly one of the three alternatives matched; the others are None.
        slug = m.group("slug1") or m.group("slug2") or m.group("slug3") or ""
    elif ats_name in _SLUG_CAREERS_URLS:
        slug = m.group("slug") or ""
    else:
        return None  # pragma: no cover

    slug = slug.strip()
    if not slug:
        return None
    return _board_from_slug(ats_name, slug)


# ---------------------------------------------------------------------------
# Per-ATS fetch functions — each returns ATSFetch
# ---------------------------------------------------------------------------
# Every fetcher is best-effort: any HTTP, JSON or shape error is logged and
# turned into an empty ATSFetch so the caller's cascade can keep going.


def _fetch_greenhouse(board: ATSBoard, client: httpx.Client) -> ATSFetch:
    """Call Greenhouse boards API; return ATSFetch with first_url, job_count, org_name."""
    url = f"https://boards-api.greenhouse.io/v1/boards/{board.slug}/jobs?per_page=1"
    try:
        resp = request_with_retries(client, "GET", url, max_retries=2)
        if resp.status_code != 200:
            logger.warning("greenhouse(%s): HTTP %s", board.slug, resp.status_code)
            return ATSFetch()
        data = resp.json()
        jobs = data.get("jobs") or []
        # Fall back to the page length when meta.total is missing *or* null;
        # a null total would otherwise fail ATSFetch validation and discard
        # a perfectly good first_url.
        total = (data.get("meta") or {}).get("total")
        count = len(jobs) if total is None else total
        first_job = jobs[0] if jobs else {}
        return ATSFetch(
            first_url=first_job.get("absolute_url"),
            job_count=count,
            org_name=first_job.get("company_name"),
        )
    except Exception as exc:
        logger.warning("greenhouse(%s): fetch error: %s", board.slug, exc)
        return ATSFetch()


def _fetch_lever(board: ATSBoard, client: httpx.Client) -> ATSFetch:
    """Call Lever postings API; return ATSFetch with first_url and job_count.

    The request uses limit=1, so job_count is the length of the returned
    array (0 or 1) rather than the board's full posting count.
    """
    url = f"https://api.lever.co/v0/postings/{board.slug}?mode=json&limit=1"
    try:
        resp = request_with_retries(client, "GET", url, max_retries=2)
        if resp.status_code != 200:
            logger.warning("lever(%s): HTTP %s", board.slug, resp.status_code)
            return ATSFetch()
        data = resp.json()
        if not isinstance(data, list):
            logger.warning("lever(%s): unexpected response type %s", board.slug, type(data))
            return ATSFetch()
        first_url = data[0].get("hostedUrl") if data else None
        return ATSFetch(first_url=first_url, job_count=len(data))
    except Exception as exc:
        logger.warning("lever(%s): fetch error: %s", board.slug, exc)
        return ATSFetch()


def _fetch_ashby(board: ATSBoard, client: httpx.Client) -> ATSFetch:
    """Call Ashby job-board API (GET); return ATSFetch with first_url and job_count."""
    url = f"https://api.ashbyhq.com/posting-api/job-board/{board.slug}"
    try:
        resp = request_with_retries(client, "GET", url, max_retries=2)
        if resp.status_code != 200:
            logger.warning("ashby(%s): HTTP %s", board.slug, resp.status_code)
            return ATSFetch()
        data = resp.json()
        jobs = data.get("jobs") or []
        first_url = jobs[0].get("jobUrl") if jobs else None
        return ATSFetch(first_url=first_url, job_count=len(jobs))
    except Exception as exc:
        logger.warning("ashby(%s): fetch error: %s", board.slug, exc)
        return ATSFetch()


def _fetch_workday(board: ATSBoard, client: httpx.Client) -> ATSFetch:
    """Call Workday CXS jobs endpoint (POST); return ATSFetch with first_url and total count.

    Returns an empty ATSFetch without any request when the board lacks the
    Workday-specific coordinates (wd_host / wd_tenant / wd_site).
    """
    if not (board.wd_host and board.wd_tenant and board.wd_site):
        return ATSFetch()
    api_url = f"https://{board.wd_host}/wday/cxs/{board.wd_tenant}/{board.wd_site}/jobs"
    payload = {"appliedFacets": {}, "limit": 1, "offset": 0, "searchText": ""}
    try:
        resp = request_with_retries(
            client,
            "POST",
            api_url,
            json=payload,
            headers={"Content-Type": "application/json"},
            max_retries=2,
        )
        if resp.status_code != 200:
            logger.warning("workday(%s): HTTP %s", board.slug, resp.status_code)
            return ATSFetch()
        data = resp.json()
        # `or 0` also covers an explicit JSON null, which would otherwise
        # fail ATSFetch validation and drop the posting URL.
        total = data.get("total") or 0
        postings = data.get("jobPostings") or []
        first_url: str | None = None
        if postings:
            ext_path = postings[0].get("externalPath") or ""
            if ext_path:
                first_url = f"https://{board.wd_host}/en-US/{board.wd_site}{ext_path}"
        return ATSFetch(first_url=first_url, job_count=total)
    except Exception as exc:
        logger.warning("workday(%s): fetch error: %s", board.slug, exc)
        return ATSFetch()


# Platform name → fetcher. Keys must match the names used in _ATS_CHECKS.
_FETCH_DISPATCH: dict[str, Callable[[ATSBoard, httpx.Client], ATSFetch]] = {
    "greenhouse": _fetch_greenhouse,
    "lever": _fetch_lever,
    "ashby": _fetch_ashby,
    "workday": _fetch_workday,
}


# ---------------------------------------------------------------------------
# Public orchestrator — HTML-detection path (Tier 1)
# ---------------------------------------------------------------------------


def detect_and_fetch(
    website: str,
    client: httpx.Client,
    *,
    homepage_html: str | None = None,
) -> ATSResult | None:
    """Detect the ATS for a company website and fetch the first job via its API.

    If *homepage_html* is provided it is used directly (avoids a redundant GET).

    Returns an ATSResult on success (position_url may be None if the board has
    no live jobs), or None if no ATS was detected or the homepage fetch failed.
    """
    html = homepage_html
    if html is None:
        try:
            resp = request_with_retries(client, "GET", website, max_retries=1)
        except Exception as exc:
            logger.warning("ats detect_and_fetch(%s): homepage fetch error: %s", website, exc)
            return None
        if resp.status_code < 400:
            html = resp.text
        else:
            # Previously a silent miss; log it so failed lookups are traceable.
            logger.warning(
                "ats detect_and_fetch(%s): homepage HTTP %s", website, resp.status_code
            )

    if not html:
        return None

    board = detect_ats_in_html(html)
    if board is None:
        return None

    logger.info(
        "ats(%s): detected %s slug=%s careers_url=%s",
        website,
        board.ats_name,
        board.slug,
        board.careers_url,
    )

    fetch_fn = _FETCH_DISPATCH.get(board.ats_name)
    if fetch_fn is None:  # pragma: no cover
        return ATSResult(ats_name=board.ats_name, careers_url=board.careers_url)

    fetch = fetch_fn(board, client)
    logger.info(
        "ats(%s): %s board has %s jobs; first_url=%s",
        website,
        board.ats_name,
        fetch.job_count,
        fetch.first_url,
    )
    return ATSResult(
        ats_name=board.ats_name,
        careers_url=board.careers_url,
        position_url=fetch.first_url,
        job_count=fetch.job_count,
    )


# ---------------------------------------------------------------------------
# Slug-guess recovery helpers — Tier 1b (JS-embedded / SPA boards)
# ---------------------------------------------------------------------------


def _domain_stem(website: str) -> str | None:
    """Extract the first DNS label (lowercased) from a website URL, stripping www.

    Example: 'https://www.anthropic.com' → 'anthropic'

    Scheme-less input ('acme.com/careers') is handled by falling back to the
    first path segment. Returns None when no label can be extracted.
    """
    try:
        parts = urlsplit(website)
        # `hostname` excludes userinfo and port, which `netloc` would keep.
        host = parts.hostname or parts.path.split("/")[0]
        host = re.sub(r"^www\.", "", host, count=1, flags=re.IGNORECASE).lower()
        stem = host.split(".")[0]
        return stem or None
    except (AttributeError, TypeError, ValueError):
        # Non-string input or a URL that urlsplit rejects: no stem available.
        return None


def _slug_candidates(website: str, company_name: str | None) -> list[str]:
    """Return ordered, deduped ATS slug candidates (≤ 3) for a company.

    Order: domain stem first (most specific/unique), then normalized company name.
    """
    raw = [_domain_stem(website)]
    if company_name:
        raw.append(_slug(company_name))
    # dict.fromkeys dedupes while keeping first-seen order; empties are dropped.
    return list(dict.fromkeys(candidate for candidate in raw if candidate))[:3]


def _board_from_slug(ats_name: str, slug: str) -> ATSBoard:
    """Construct an ATSBoard for Greenhouse / Lever / Ashby from a bare slug.

    Raises:
        ValueError: if *ats_name* is not a slug-addressed platform (for
            example Workday, which needs tenant + site). Previously any
            unknown name silently produced an Ashby board.
    """
    template = _SLUG_CAREERS_URLS.get(ats_name)
    if template is None:
        raise ValueError(f"unsupported ATS for slug-based board: {ats_name!r}")
    return ATSBoard(
        ats_name=ats_name,
        slug=slug,
        careers_url=template.format(slug=slug),
    )


def _loose_name_match(input_name: str, org_name: str) -> bool:
    """Return True if company names loosely match (slug of one is a substring of the other).

    If either side normalizes to empty, returns True (unverifiable → don't reject).
    """
    a = _slug(input_name) or ""
    b = _slug(org_name) or ""
    if not a or not b:
        return True
    return a in b or b in a


# Platforms probed in slug-guess recovery (Workday excluded — needs tenant+site).
_SLUG_GUESS_PLATFORMS = ("greenhouse", "lever", "ashby")


def recover_via_slug_guess(
    website: str,
    company_name: str | None,
    client: httpx.Client,
) -> ATSResult | None:
    """Probe Greenhouse/Lever/Ashby APIs with guessed slugs when HTML detection misses.

    Used as Tier 1b in the cascade — catches companies whose ATS board is
    injected via client-side JS and therefore invisible to static HTML
    detection (e.g. Anthropic's Greenhouse board rendered by Next.js).

    Slug candidates: domain stem first (e.g. anthropic.com → 'anthropic'), then
    the normalized company name. First board with job_count > 0 wins.

    False-positive guard: if Greenhouse returns a company_name, it is loosely
    cross-checked against the input company_name; a clear mismatch is rejected.

    Returns ATSResult on success or None on all-miss.
    """
    candidates = _slug_candidates(website, company_name)
    if not candidates:
        return None

    for slug in candidates:
        for ats_name in _SLUG_GUESS_PLATFORMS:
            fetch_fn = _FETCH_DISPATCH.get(ats_name)
            if fetch_fn is None:  # pragma: no cover
                continue
            board = _board_from_slug(ats_name, slug)
            try:
                fetch = fetch_fn(board, client)
            except Exception as exc:
                # Fetchers already swallow their own errors; this is a last
                # line of defence so one bad probe cannot abort the sweep.
                logger.debug(
                    "recover_via_slug_guess(%s/%s/%s): error: %s",
                    website,
                    ats_name,
                    slug,
                    exc,
                )
                continue

            if not fetch.first_url or fetch.job_count == 0:
                continue

            # Cross-check org name if the platform provides it (Greenhouse only).
            if (
                company_name
                and fetch.org_name
                and not _loose_name_match(company_name, fetch.org_name)
            ):
                logger.info(
                    "recover_via_slug_guess(%s): %s slug=%s org_name=%r "
                    "does not match input %r — skipping",
                    website,
                    ats_name,
                    slug,
                    fetch.org_name,
                    company_name,
                )
                continue

            logger.info(
                "recover_via_slug_guess(%s): hit %s slug=%s jobs=%s careers_url=%s",
                website,
                ats_name,
                slug,
                fetch.job_count,
                board.careers_url,
            )
            return ATSResult(
                ats_name=ats_name,
                careers_url=board.careers_url,
                position_url=fetch.first_url,
                job_count=fetch.job_count,
            )

    logger.info("recover_via_slug_guess(%s): all candidates missed", website)
    return None