phase2-ATS + heuristic careers finding

2026-06-17 17:33:11 -04:00
parent cd9ab9b95e
commit 113a4ced36
11 changed files with 2836 additions and 39 deletions
--- a/jobsource/careers/__init__.py
+++ b/jobsource/careers/__init__.py
@@ -1 +1,4 @@
 """Careers page discovery sub-package (Stage 2 cascade)."""
+from .cascade import CareersResult, find_careers_page
+
+__all__ = ["find_careers_page", "CareersResult"]
--- a/jobsource/careers/ats.py
+++ b/jobsource/careers/ats.py
@@ -1,17 +1,506 @@
 """ATS detection and public JSON API fetching (Stage 2, tier 1).

-Scaffold stub -- not implemented yet.
+Detects Greenhouse / Lever / Ashby / Workday from a company homepage's HTML or
+from a URL string, then calls each platform's public (no-auth) JSON API to
+return both a canonical careers-page URL and the first open-position URL.
+
+Live-verified API shapes (2026-06-17):
+  Greenhouse: GET https://boards-api.greenhouse.io/v1/boards/{slug}/jobs?per_page=1
+              → {"jobs":[{"absolute_url":"...","company_name":"..."},...], "meta":{"total":N}}
+  Lever:      GET https://api.lever.co/v0/postings/{slug}?mode=json&limit=1
+              → [{"hostedUrl":"..."},...] (JSON array, empty list if no slug)
+  Ashby:      GET https://api.ashbyhq.com/posting-api/job-board/{slug}
+              → {"jobs":[{"jobUrl":"..."},...], "apiVersion":"..."}
+              NOTE: GET only — POST is NOT used (CLAUDE.md gotcha was wrong)
+  Workday:    POST https://{host}/wday/cxs/{tenant}/{site}/jobs
+              body {"appliedFacets":{},"limit":1,"offset":0,"searchText":""}
+              → {"total":N,"jobPostings":[{"externalPath":"/job/..."},...]}
+              Job URL: https://{host}/en-US/{site}{externalPath}
 """
-# TODO (Stage 2, tier 1): implement per CLAUDE.md "Stage 2 — ATS detection".
-# Detect Greenhouse / Lever / Ashby / Workday from the company website, then call
-# their public JSON APIs (no login needed). On success, return both the careers page URL
-# AND the first job posting URL (so Stage 3 can skip its own cascade for ATS companies).
-#
-# Confirmed ATS JSON field shapes (verify live before trusting — see CLAUDE.md Gotchas):
-#   Greenhouse: GET https://boards-api.greenhouse.io/v1/boards/{slug}/jobs
-#               → {"jobs": [{"absolute_url": "...", ...}, ...]}
-#   Lever:      GET https://api.lever.co/v0/postings/{company}?mode=json
-#               → [{"hostedUrl": "...", ...}, ...]
-#   Ashby:      POST https://api.ashbyhq.com/posting-api/job-board/{slug}
-#               → {"jobs": [{"jobUrl": "...", ...}, ...]}
-#   Workday:    varies by tenant — needs per-tenant discovery logic
+from __future__ import annotations
+
+import logging
+import re
+from urllib.parse import urlsplit
+
+import httpx
+from pydantic import BaseModel
+
+from ..http import build_client, request_with_retries
+from ..resolve import _slug
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Detection patterns — keyed by ATS name, applied to raw HTML or URL strings.
+# Each pattern yields named groups used to build board coordinates.
+# ---------------------------------------------------------------------------
+
+# Greenhouse: direct board links plus embed script / iframe URLs (?for=<slug>).
+# Captures: slug1 | slug2 | slug3 (exactly one is set); "embed/" is never taken as a slug.
+_GH_PATTERN = re.compile(
+    r"(?:"
+    r"boards\.greenhouse\.io/(?!embed/)(?P<slug1>[A-Za-z0-9_-]+)"
+    r"|job-boards\.greenhouse\.io/(?!embed/)(?P<slug2>[A-Za-z0-9_-]+)"
+    r"|greenhouse\.io/embed/job_(?:board(?:/js)?|app)\?for=(?P<slug3>[A-Za-z0-9_-]+)"
+    r")",
+    re.IGNORECASE,
+)
+
+# Lever: board is always jobs.lever.co/{slug}
+# Captures: slug
+_LEVER_PATTERN = re.compile(
+    r"jobs\.lever\.co/(?P<slug>[A-Za-z0-9_-]+)",
+    re.IGNORECASE,
+)
+
+# Ashby: board is always jobs.ashbyhq.com/{slug}; slugs may be mixed-case
+# Captures: slug
+_ASHBY_PATTERN = re.compile(
+    r"jobs\.ashbyhq\.com/(?P<slug>[A-Za-z0-9_%-]+)",
+    re.IGNORECASE,
+)
+
+# Workday: tenant.wdN.myworkdayjobs.com[/locale]/site
+# Captures: tenant, wdnum (the N in wdN), site
+# Locale portion (e.g. "en-US/") is optional and consumed but not captured.
+_WORKDAY_PATTERN = re.compile(
+    r"(?P<tenant>[A-Za-z0-9_-]+)\.wd(?P<wdnum>\d+)\.myworkdayjobs\.com"
+    r"(?:/[a-z]{2}-[A-Z]{2})?"          # optional locale segment
+    r"/(?P<site>[A-Za-z0-9_%-]+)",
+    re.IGNORECASE,
+)
+
+# Ordered list used by both detect_ats_in_html and detect_ats_in_url
+_ATS_CHECKS: list[tuple[str, re.Pattern[str]]] = [
+    ("greenhouse", _GH_PATTERN),
+    ("lever", _LEVER_PATTERN),
+    ("ashby", _ASHBY_PATTERN),
+    ("workday", _WORKDAY_PATTERN),
+]
+
+
+# ---------------------------------------------------------------------------
+# Data models
+# ---------------------------------------------------------------------------
+
+
+class ATSBoard(BaseModel):
+    """Coordinates for a detected ATS board (detection output)."""
+
+    ats_name: str                   # greenhouse | lever | ashby | workday
+    slug: str                       # board slug / company identifier
+    careers_url: str                # canonical human-readable careers board URL
+    wd_host: str | None = None      # workday only: full myworkdayjobs.com host
+    wd_tenant: str | None = None    # workday only: tenant portion of host
+    wd_site: str | None = None      # workday only: site path segment
+
+
+class ATSFetch(BaseModel):
+    """Per-platform API fetch result; single source of truth for response field shapes."""
+
+    first_url: str | None = None    # first open-position URL from the board
+    job_count: int = 0              # total jobs reported by the board
+    org_name: str | None = None     # company_name from the API (Greenhouse only)
+
+
+class ATSResult(BaseModel):
+    """Output of detect_and_fetch / recover_via_slug_guess: careers URL + optional first-job URL."""
+
+    ats_name: str
+    careers_url: str
+    position_url: str | None = None
+    job_count: int = 0
+
+
+# ---------------------------------------------------------------------------
+# Detection
+# ---------------------------------------------------------------------------
+
+
+def detect_ats_in_html(html: str) -> ATSBoard | None:
+    """Scan page HTML for known ATS signals; return the first match or None.
+
+    Tries patterns in order: Greenhouse → Lever → Ashby → Workday.
+    Pure function; no network calls.
+    """
+    for ats_name, pattern in _ATS_CHECKS:
+        m = pattern.search(html)
+        if m:
+            board = _build_board(ats_name, m)
+            if board:
+                return board
+    return None
+
+
+def detect_ats_in_url(url: str) -> ATSBoard | None:
+    """Scan a single URL string for ATS board coordinates; return first match or None.
+
+    Used by the cascade finalizer to recognise when a heuristic-found careers
+    link is itself an ATS board (e.g. jobs.lever.co/acme).
+    Pure function; no network calls — *url* is pattern-matched, never fetched.
+
+    The patterns are unanchored searches, so a bare URL is scanned exactly like
+    a one-line HTML document; delegating keeps the pattern order and the
+    board-construction path in a single place.
+    """
+    # Identical scan to the HTML path: the first pattern in _ATS_CHECKS that
+    # yields a non-empty board wins.
+    return detect_ats_in_html(url)
+
+
+def _build_board(ats_name: str, m: re.Match[str]) -> ATSBoard | None:
+    """Construct an ATSBoard from a regex match; return None if slug is empty."""
+    if ats_name == "greenhouse":
+        slug = m.group("slug1") or m.group("slug2") or m.group("slug3") or ""
+        slug = slug.strip()
+        if not slug:
+            return None
+        return ATSBoard(
+            ats_name="greenhouse",
+            slug=slug,
+            careers_url=f"https://boards.greenhouse.io/{slug}",
+        )
+
+    if ats_name == "lever":
+        slug = (m.group("slug") or "").strip()
+        if not slug:
+            return None
+        return ATSBoard(
+            ats_name="lever",
+            slug=slug,
+            careers_url=f"https://jobs.lever.co/{slug}",
+        )
+
+    if ats_name == "ashby":
+        slug = (m.group("slug") or "").strip()
+        if not slug:
+            return None
+        return ATSBoard(
+            ats_name="ashby",
+            slug=slug,
+            careers_url=f"https://jobs.ashbyhq.com/{slug}",
+        )
+
+    if ats_name == "workday":
+        tenant = (m.group("tenant") or "").strip()
+        wdnum = (m.group("wdnum") or "").strip()
+        site = (m.group("site") or "").strip()
+        if not (tenant and wdnum and site):
+            return None
+        host = f"{tenant}.wd{wdnum}.myworkdayjobs.com"
+        return ATSBoard(
+            ats_name="workday",
+            slug=f"{tenant}/{site}",
+            careers_url=f"https://{host}/en-US/{site}",
+            wd_host=host,
+            wd_tenant=tenant,
+            wd_site=site,
+        )
+
+    return None  # pragma: no cover
+
+
+# ---------------------------------------------------------------------------
+# Per-ATS fetch functions — each returns ATSFetch
+# ---------------------------------------------------------------------------
+
+
+def _fetch_greenhouse(board: ATSBoard, client: httpx.Client) -> ATSFetch:
+    """Call Greenhouse boards API; return ATSFetch with first_url, job_count, org_name."""
+    url = f"https://boards-api.greenhouse.io/v1/boards/{board.slug}/jobs?per_page=1"
+    try:
+        resp = request_with_retries(client, "GET", url, max_retries=2)
+        if resp.status_code != 200:
+            logger.warning("greenhouse(%s): HTTP %s", board.slug, resp.status_code)
+            return ATSFetch()
+        data = resp.json()
+        jobs = data.get("jobs") or []
+        count = (data.get("meta") or {}).get("total", len(jobs))
+        first_url = jobs[0].get("absolute_url") if jobs else None
+        org_name = jobs[0].get("company_name") if jobs else None
+        return ATSFetch(first_url=first_url, job_count=count, org_name=org_name)
+    except Exception as exc:
+        logger.warning("greenhouse(%s): fetch error: %s", board.slug, exc)
+        return ATSFetch()
+
+
+def _fetch_lever(board: ATSBoard, client: httpx.Client) -> ATSFetch:
+    """Call Lever postings API; return ATSFetch with first_url and job_count (len of the limit=1 response, so presumably 0 or 1 — not a board total)."""
+    url = f"https://api.lever.co/v0/postings/{board.slug}?mode=json&limit=1"
+    try:
+        resp = request_with_retries(client, "GET", url, max_retries=2)
+        if resp.status_code != 200:
+            logger.warning("lever(%s): HTTP %s", board.slug, resp.status_code)
+            return ATSFetch()
+        data = resp.json()
+        if not isinstance(data, list):
+            logger.warning("lever(%s): unexpected response type %s", board.slug, type(data))
+            return ATSFetch()
+        first_url = data[0].get("hostedUrl") if data else None
+        return ATSFetch(first_url=first_url, job_count=len(data))
+    except Exception as exc:
+        logger.warning("lever(%s): fetch error: %s", board.slug, exc)
+        return ATSFetch()
+
+
+def _fetch_ashby(board: ATSBoard, client: httpx.Client) -> ATSFetch:
+    """Call Ashby job-board API (GET); return ATSFetch with first_url and job_count."""
+    url = f"https://api.ashbyhq.com/posting-api/job-board/{board.slug}"
+    try:
+        resp = request_with_retries(client, "GET", url, max_retries=2)
+        if resp.status_code != 200:
+            logger.warning("ashby(%s): HTTP %s", board.slug, resp.status_code)
+            return ATSFetch()
+        data = resp.json()
+        jobs = data.get("jobs") or []
+        first_url = jobs[0].get("jobUrl") if jobs else None
+        return ATSFetch(first_url=first_url, job_count=len(jobs))
+    except Exception as exc:
+        logger.warning("ashby(%s): fetch error: %s", board.slug, exc)
+        return ATSFetch()
+
+
+def _fetch_workday(board: ATSBoard, client: httpx.Client) -> ATSFetch:
+    """Call Workday CXS jobs endpoint (POST); return ATSFetch with first_url and total count."""
+    if not (board.wd_host and board.wd_tenant and board.wd_site):
+        return ATSFetch()
+    api_url = (
+        f"https://{board.wd_host}/wday/cxs/{board.wd_tenant}/{board.wd_site}/jobs"
+    )
+    payload = {"appliedFacets": {}, "limit": 1, "offset": 0, "searchText": ""}
+    try:
+        resp = request_with_retries(
+            client, "POST", api_url,
+            json=payload,
+            headers={"Content-Type": "application/json"},
+            max_retries=2,
+        )
+        if resp.status_code != 200:
+            logger.warning("workday(%s): HTTP %s", board.slug, resp.status_code)
+            return ATSFetch()
+        data = resp.json()
+        total = data.get("total", 0)
+        postings = data.get("jobPostings") or []
+        first_url: str | None = None
+        if postings:
+            ext_path = postings[0].get("externalPath") or ""
+            if ext_path:
+                first_url = (
+                    f"https://{board.wd_host}/en-US/{board.wd_site}{ext_path}"
+                )
+        return ATSFetch(first_url=first_url, job_count=total)
+    except Exception as exc:
+        logger.warning("workday(%s): fetch error: %s", board.slug, exc)
+        return ATSFetch()
+
+
+_FETCH_DISPATCH: dict[str, object] = {
+    "greenhouse": _fetch_greenhouse,
+    "lever": _fetch_lever,
+    "ashby": _fetch_ashby,
+    "workday": _fetch_workday,
+}
+
+
+# ---------------------------------------------------------------------------
+# Public orchestrator — HTML-detection path (Tier 1)
+# ---------------------------------------------------------------------------
+
+
+def detect_and_fetch(
+    website: str,
+    client: httpx.Client,
+    *,
+    homepage_html: str | None = None,
+) -> ATSResult | None:
+    """Detect the ATS for a company website and fetch the first job via its API.
+
+    If *homepage_html* is provided it is used directly (avoids a redundant GET).
+    Returns an ATSResult on success (position_url may be None if the board has
+    no live jobs), or None if no ATS was detected or the homepage fetch failed.
+    """
+    html = homepage_html
+    if html is None:
+        try:
+            resp = request_with_retries(client, "GET", website, max_retries=1)
+            if resp.status_code < 400:
+                html = resp.text
+        except Exception as exc:
+            logger.warning("ats detect_and_fetch(%s): homepage fetch error: %s", website, exc)
+            return None
+
+    if not html:
+        return None
+
+    board = detect_ats_in_html(html)
+    if board is None:
+        return None
+
+    logger.info(
+        "ats(%s): detected %s slug=%s careers_url=%s",
+        website, board.ats_name, board.slug, board.careers_url,
+    )
+
+    fetch_fn = _FETCH_DISPATCH.get(board.ats_name)
+    if fetch_fn is None:  # pragma: no cover
+        return ATSResult(ats_name=board.ats_name, careers_url=board.careers_url)
+
+    fetch = fetch_fn(board, client)  # type: ignore[operator]
+    logger.info(
+        "ats(%s): %s board has %s jobs; first_url=%s",
+        website, board.ats_name, fetch.job_count, fetch.first_url,
+    )
+    return ATSResult(
+        ats_name=board.ats_name,
+        careers_url=board.careers_url,
+        position_url=fetch.first_url,
+        job_count=fetch.job_count,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Slug-guess recovery helpers — Tier 1b (JS-embedded / SPA boards)
+# ---------------------------------------------------------------------------
+
+
+def _domain_stem(website: str) -> str | None:
+    """Return the first DNS label (lowercased, leading www. stripped) of a website URL.
+
+    Uses the parsed hostname, so userinfo and port never leak into the stem.
+    Example: 'https://www.anthropic.com' → 'anthropic'; None if nothing usable.
+    """
+    try:
+        parts = urlsplit(website)
+        host = (parts.hostname or parts.path.split("/")[0]).lower()
+        stem = re.sub(r"^www\.", "", host, count=1).split(".")[0]
+        return stem or None
+    except Exception:
+        return None
+
+
+def _slug_candidates(website: str, company_name: str | None) -> list[str]:
+    """Return ordered, deduped ATS slug candidates (≤ 3) for a company.
+
+    Order: domain stem first (most specific/unique), then normalized company name.
+    """
+    candidates: list[str] = []
+    seen: set[str] = set()
+
+    stem = _domain_stem(website)
+    if stem and stem not in seen:
+        candidates.append(stem)
+        seen.add(stem)
+
+    if company_name:
+        name_slug = _slug(company_name)
+        if name_slug and name_slug not in seen:
+            candidates.append(name_slug)
+            seen.add(name_slug)
+
+    return candidates[:3]
+
+
+def _board_from_slug(ats_name: str, slug: str) -> ATSBoard:
+    """Construct an ATSBoard for Greenhouse / Lever / Ashby slug-guess probing."""
+    if ats_name == "greenhouse":
+        return ATSBoard(
+            ats_name="greenhouse",
+            slug=slug,
+            careers_url=f"https://boards.greenhouse.io/{slug}",
+        )
+    if ats_name == "lever":
+        return ATSBoard(
+            ats_name="lever",
+            slug=slug,
+            careers_url=f"https://jobs.lever.co/{slug}",
+        )
+    # ashby
+    return ATSBoard(
+        ats_name="ashby",
+        slug=slug,
+        careers_url=f"https://jobs.ashbyhq.com/{slug}",
+    )
+
+
+def _loose_name_match(input_name: str, org_name: str) -> bool:
+    """Return True if company names loosely match (slug of one is a substring of the other).
+
+    If either side normalizes to empty, returns True (unverifiable → don't reject).
+    """
+    a = _slug(input_name) or ""
+    b = _slug(org_name) or ""
+    if not a or not b:
+        return True
+    return a in b or b in a
+
+
+# Platforms probed in slug-guess recovery (Workday excluded — needs tenant+site).
+_SLUG_GUESS_PLATFORMS = ("greenhouse", "lever", "ashby")
+
+
+def recover_via_slug_guess(
+    website: str,
+    company_name: str | None,
+    client: httpx.Client,
+) -> ATSResult | None:
+    """Probe Greenhouse/Lever/Ashby APIs with guessed slugs when HTML detection misses.
+
+    Used as Tier 1b in the cascade — catches companies whose ATS board is
+    injected via client-side JS and therefore invisible to static HTML detection
+    (e.g. Anthropic's Greenhouse board rendered by Next.js).
+
+    Slug candidates: domain stem first (e.g. anthropic.com → 'anthropic'), then
+    the normalized company name. First board with job_count > 0 wins.
+
+    False-positive guard: if Greenhouse returns a company_name, it is loosely
+    cross-checked against the input company_name; a clear mismatch is rejected.
+
+    Returns ATSResult on success or None on all-miss.
+    """
+    candidates = _slug_candidates(website, company_name)
+    if not candidates:
+        return None
+
+    for slug in candidates:
+        for ats_name in _SLUG_GUESS_PLATFORMS:
+            board = _board_from_slug(ats_name, slug)
+            fetch_fn = _FETCH_DISPATCH.get(ats_name)
+            if fetch_fn is None:  # pragma: no cover
+                continue
+            try:
+                fetch = fetch_fn(board, client)  # type: ignore[operator]
+            except Exception as exc:
+                logger.debug(
+                    "recover_via_slug_guess(%s/%s/%s): error: %s",
+                    website, ats_name, slug, exc,
+                )
+                continue
+
+            if not fetch.first_url or fetch.job_count == 0:
+                continue
+
+            # Cross-check org name if the platform provides it (Greenhouse only).
+            if company_name and fetch.org_name:
+                if not _loose_name_match(company_name, fetch.org_name):
+                    logger.info(
+                        "recover_via_slug_guess(%s): %s slug=%s org_name=%r "
+                        "does not match input %r — skipping",
+                        website, ats_name, slug, fetch.org_name, company_name,
+                    )
+                    continue
+
+            logger.info(
+                "recover_via_slug_guess(%s): hit %s slug=%s jobs=%s careers_url=%s",
+                website, ats_name, slug, fetch.job_count, board.careers_url,
+            )
+            return ATSResult(
+                ats_name=ats_name,
+                careers_url=board.careers_url,
+                position_url=fetch.first_url,
+                job_count=fetch.job_count,
+            )
+
+    logger.info("recover_via_slug_guess(%s): all candidates missed", website)
+    return None
--- a/jobsource/careers/cascade.py
+++ b/jobsource/careers/cascade.py
@@ -1,13 +1,234 @@
 """find_careers_page(): orchestrate the Stage 2 tier cascade.

-Scaffold stub -- not implemented yet.
+Cascade order (return early on first success):
+  1. ATS detection → ats.detect_and_fetch() 0.95, then ats.recover_via_slug_guess() 0.90
+  2. URL patterns  → heuristics.probe_url_patterns()  0.80
+  3. Homepage scan → heuristics.scan_homepage_links()  0.60
+  4. Sitemap       → heuristics.parse_sitemap()        0.50
+  5. Cheap-LLM     → classify_llm (stub, not implemented in this phase)
+  6. Browser agent → agent_fallback (stub, not implemented in this phase)
+
+Returns a CareersResult with the URL, confidence, method string, and — when
+the ATS tier resolves — the first open-position URL for free (Stage-3 shortcut).
+
+The optional *client* parameter follows the managed-client pattern from
+resolve.py: supply an existing httpx.Client to reuse connections; otherwise a
+short-lived client is created and closed here.
 """
-# TODO (Stage 2): implement per CLAUDE.md "Stage 2 — Find careers page (cascade, return on first hit)".
-# Cascade order (return early on first success):
-#   1. ATS detection  → ats.detect_and_fetch()
-#   2. URL patterns   → heuristics.probe_url_patterns()
-#   3. Homepage scan  → heuristics.scan_homepage_links()
-#   4. Sitemap        → heuristics.parse_sitemap()
-#   5. Cheap-LLM      → classify_llm.classify_careers_link()
-#   6. Browser agent  → agent_fallback.run_fused_agent()  (also handles Stage 3)
-# Returns (careers_url: str | None, method: str, ats_name: str | None).
+from __future__ import annotations
+
+import logging
+
+import httpx
+from pydantic import BaseModel
+
+from ..http import build_client, request_with_retries
+from . import ats as _ats
+from . import heuristics as _heuristics
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Result model
+# ---------------------------------------------------------------------------
+
+
+class CareersResult(BaseModel):
+    """Typed output of find_careers_page()."""
+
+    careers_url: str | None = None
+    confidence: float = 0.0
+    # method values: "ats:{name}" | "ats:{name}:slug_guess" | "url_pattern" | "homepage_scan" | "sitemap" | "none"
+    method: str = "none"
+    ats_name: str | None = None
+    # Free Stage-3 shortcut: populated when ATS tier resolves (first open job URL).
+    position_url: str | None = None
+
+
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+
+
+def find_careers_page(
+    website: str,
+    *,
+    company_name: str | None = None,
+    client: httpx.Client | None = None,
+) -> CareersResult:
+    """Run the careers-page discovery cascade for one company website.
+
+    *company_name* (optional) adds a second Tier 1b slug candidate and enables the
+    org-name cross-check. *client* is reused if given, else built and closed here.
+    Returns a CareersResult; tier errors are logged and fall through, not raised.
+    """
+    _managed = client is None
+    if _managed:
+        client = build_client()
+
+    try:
+        # Fetch the homepage once; shared by ATS detection and homepage-link scan.
+        homepage_html: str | None = _safe_get_html(website, client)
+
+        # ------------------------------------------------------------------
+        # Tier 1 — ATS detection + public JSON API
+        # ------------------------------------------------------------------
+        try:
+            ats_result = _ats.detect_and_fetch(
+                website, client, homepage_html=homepage_html or "",  # "" = fetch already failed; no re-GET
+            )
+            if ats_result is not None:
+                logger.info(
+                    "cascade(%s): resolved via ats:%s careers_url=%s",
+                    website, ats_result.ats_name, ats_result.careers_url,
+                )
+                return CareersResult(
+                    careers_url=ats_result.careers_url,
+                    confidence=0.95,
+                    method=f"ats:{ats_result.ats_name}",
+                    ats_name=ats_result.ats_name,
+                    position_url=ats_result.position_url,
+                )
+        except Exception as exc:
+            logger.warning("cascade(%s): ats tier error: %s", website, exc)
+
+        # ------------------------------------------------------------------
+        # Tier 1b — Slug-guess ATS recovery (JS-embedded / SPA boards)
+        # ------------------------------------------------------------------
+        try:
+            rec = _ats.recover_via_slug_guess(website, company_name, client)
+            if rec is not None:
+                logger.info(
+                    "cascade(%s): resolved via ats:%s:slug_guess careers_url=%s",
+                    website, rec.ats_name, rec.careers_url,
+                )
+                return CareersResult(
+                    careers_url=rec.careers_url,
+                    confidence=0.90,
+                    method=f"ats:{rec.ats_name}:slug_guess",
+                    ats_name=rec.ats_name,
+                    position_url=rec.position_url,
+                )
+        except Exception as exc:
+            logger.warning("cascade(%s): slug_guess tier error: %s", website, exc)
+
+        # ------------------------------------------------------------------
+        # Tier 2 — URL-pattern probing
+        # ------------------------------------------------------------------
+        try:
+            url = _heuristics.probe_url_patterns(website, client)
+            if url:
+                return _finalize(url, "url_pattern", 0.80, website, client)
+        except Exception as exc:
+            logger.warning("cascade(%s): url_pattern tier error: %s", website, exc)
+
+        # ------------------------------------------------------------------
+        # Tier 3 — Homepage link scan (reuse fetched HTML)
+        # ------------------------------------------------------------------
+        try:
+            url = _heuristics.scan_homepage_links(
+                website, client, homepage_html=homepage_html
+            )
+            if url:
+                return _finalize(url, "homepage_scan", 0.60, website, client)
+        except Exception as exc:
+            logger.warning("cascade(%s): homepage_scan tier error: %s", website, exc)
+
+        # ------------------------------------------------------------------
+        # Tier 4 — Sitemap
+        # ------------------------------------------------------------------
+        try:
+            url = _heuristics.parse_sitemap(website, client)
+            if url:
+                return _finalize(url, "sitemap", 0.50, website, client)
+        except Exception as exc:
+            logger.warning("cascade(%s): sitemap tier error: %s", website, exc)
+
+        # All deterministic tiers missed.
+        logger.info("cascade(%s): all deterministic tiers missed", website)
+        return CareersResult()
+
+    finally:
+        if _managed:
+            client.close()
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _detect_ats_in_page(url: str, client: httpx.Client) -> "_ats.ATSBoard | None":
+    """Fetch a page and run ATS detection on its HTML; return ATSBoard or None."""
+    try:
+        resp = request_with_retries(client, "GET", url, max_retries=0)
+        if resp.status_code < 400:
+            return _ats.detect_ats_in_html(resp.text)
+    except Exception as exc:  # best-effort probe: record the failure, then fall through to None
+        logger.info("cascade: ats page probe GET %s error: %s", url, exc)
+    return None
+
+
+def _safe_get_html(website: str, client: httpx.Client) -> str | None:
+    """Best-effort homepage fetch; return HTML text or None on any failure."""
+    try:
+        resp = request_with_retries(client, "GET", website, max_retries=1)
+        if resp.status_code < 400:
+            return resp.text
+        logger.info("cascade: homepage GET %s returned HTTP %s", website, resp.status_code)
+        return None
+    except Exception as exc:
+        logger.info("cascade: homepage GET %s error: %s", website, exc)
+        return None
+
+
+def _finalize(
+    url: str,
+    method: str,
+    confidence: float,
+    website: str,
+    client: httpx.Client,
+) -> CareersResult:
+    """Attempt ATS-URL upgrade for heuristic hits; return a CareersResult.
+
+    If the URL resolved by a heuristic tier is itself an ATS board page
+    (e.g. jobs.lever.co/acme), detect and fetch it so we can return a
+    position_url and upgrade the confidence to 0.95.
+    """
+    board = _ats.detect_ats_in_url(url)
+    if board is None:
+        # The URL itself isn't an ATS board link; fetch the page and check its HTML.
+        # This catches companies like Vercel/Figma whose Greenhouse embed is only on /careers.
+        board = _detect_ats_in_page(url, client)
+    if board is not None:
+        try:
+            fetch_fn = _ats._FETCH_DISPATCH.get(board.ats_name)
+            if fetch_fn is not None:
+                fetch = fetch_fn(board, client)  # type: ignore[operator]
+                upgraded_method = f"ats:{board.ats_name}"
+                logger.info(
+                    "cascade(%s): %s hit upgraded to %s careers_url=%s",
+                    website, method, upgraded_method, board.careers_url,
+                )
+                return CareersResult(
+                    careers_url=board.careers_url,
+                    confidence=0.95,
+                    method=upgraded_method,
+                    ats_name=board.ats_name,
+                    position_url=fetch.first_url,
+                )
+        except Exception as exc:
+            logger.warning(
+                "cascade(%s): ats upgrade for %s failed: %s", website, method, exc
+            )
+
+    logger.info(
+        "cascade(%s): resolved via %s careers_url=%s confidence=%.2f",
+        website, method, url, confidence,
+    )
+    return CareersResult(
+        careers_url=url,
+        confidence=confidence,
+        method=method,
+    )
--- a/jobsource/careers/heuristics.py
+++ b/jobsource/careers/heuristics.py
@@ -1,11 +1,334 @@
 """Deterministic careers-page heuristics: URL probing, homepage scan, sitemap (Stage 2, tiers 2–4).

-Scaffold stub -- not implemented yet.
+Tier 2 — URL patterns: probe /careers, /career, /jobs, /join-us, /join and
+          subdomains careers.{domain}, jobs.{domain} via HTTP HEAD/GET.
+Tier 3 — Homepage link scan: parse <a> anchors from the homepage HTML, rank
+          by career/job keywords in href and text, return highest-scored link.
+Tier 4 — Sitemap: fetch /sitemap.xml (and sitemap index children), scan <loc>
+          elements for career/job keywords, return the first match.
 """
-# TODO (Stage 2, tiers 2–4): implement per CLAUDE.md "Stage 2 — URL patterns / homepage / sitemap".
-# Tier 2 — URL patterns: probe /careers, /career, /jobs, /join-us, /join,
-#           careers.{domain}, jobs.{domain} via HTTP HEAD (or GET if HEAD fails).
-# Tier 3 — Homepage link scan: fetch homepage HTML, parse with BeautifulSoup + lxml,
-#           rank <a> anchors by career/job keywords in href/text, return highest-ranked.
-# Tier 4 — Sitemap: fetch sitemap.xml (and sitemap index if present), scan for career/job URLs.
-# Each function returns (url: str | None) so cascade.py can return early on first hit.
+from __future__ import annotations
+
+import logging
+import re
+from urllib.parse import urljoin, urlsplit
+
+import httpx
+from bs4 import BeautifulSoup
+
+from ..http import probe_url, request_with_retries
+
+logger = logging.getLogger(__name__)
+
+# URL path segments and keywords that signal a careers page.
+# Each entry is compiled case-insensitively and is unanchored, so it matches
+# anywhere in the string it is searched against.
+# NOTE(review): not referenced by any function visible in this module;
+# presumably consumed elsewhere in the package — verify before removing.
+_CAREER_PATH_PATTERNS: list[re.Pattern[str]] = [
+    re.compile(p, re.IGNORECASE)
+    for p in [
+        r"/career",         # /careers, /career
+        r"/job",            # /jobs, /job-listings
+        r"/join",           # /join-us, /join-our-team
+        r"/work-with",      # /work-with-us
+        r"/we-re-hiring",
+        r"/openings?",
+        r"/opportunities",
+        r"/positions?",
+        r"/vacancies",
+        r"/hiring",
+    ]
+]
+
+# Weighted keyword scoring for anchor text and href paths.
+# Tuples: (compiled pattern, score)
+# _score_anchor() adds the score of every pattern that matches, so one anchor
+# can accumulate weight from several entries in both tables.
+# Href patterns are searched against the raw href and must follow a "/".
+_HREF_WEIGHTS: list[tuple[re.Pattern[str], float]] = [
+    (re.compile(r"/career", re.I), 3.0),
+    (re.compile(r"/job", re.I), 2.5),
+    (re.compile(r"/join", re.I), 2.0),
+    (re.compile(r"/opening", re.I), 2.0),
+    (re.compile(r"/position", re.I), 2.0),
+    (re.compile(r"/opportunit", re.I), 1.5),
+    # The "." matches any single character, so "-", "_" etc. all qualify.
+    (re.compile(r"/work.with", re.I), 1.5),
+    (re.compile(r"/hiring", re.I), 1.5),
+    (re.compile(r"/vacanc", re.I), 1.5),
+]
+
+# Text patterns are searched against the anchor's visible text and use word
+# boundaries so that, e.g., "jobs" matches but "jobsite" does not.
+_TEXT_WEIGHTS: list[tuple[re.Pattern[str], float]] = [
+    (re.compile(r"\bcareers?\b", re.I), 3.0),
+    (re.compile(r"\bjobs?\b", re.I), 2.5),
+    (re.compile(r"\bjoin\s+us\b", re.I), 2.0),
+    (re.compile(r"\bopen\s+positions?\b", re.I), 2.0),
+    (re.compile(r"\bwork\s+with\s+us\b", re.I), 1.5),
+    # ".?" tolerates an apostrophe (or its absence) in "we're" / "were".
+    (re.compile(r"\bwe.?re\s+hiring\b", re.I), 1.5),
+    (re.compile(r"\bopportunities\b", re.I), 1.5),
+    (re.compile(r"\bhiring\b", re.I), 1.0),
+]
+
+# Minimum score to accept a homepage link as a careers page candidate.
+# The comparison in scan_homepage_links() is inclusive (score >= threshold).
+_SCORE_THRESHOLD = 2.0
+
+# Maximum child sitemaps to fetch when processing a sitemap index.
+_MAX_SITEMAP_CHILDREN = 5
+
+# Maximum <loc> entries to scan across all sitemaps.
+_MAX_LOC_ENTRIES = 500
+
+# Soft-404 path indicators used to reject SPA-style error pages.
+# Matched as plain substrings of the lower-cased final URL path, so every
+# entry must itself be lower-case.
+_SOFT_404_INDICATORS = ("notfound", "/404", "not-found", "/pagenot", "/error-page")
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _is_plausible_careers_url(original_url: str, final_url: str) -> bool:
+    """Return False for obvious false positives from redirect chains.
+
+    Rejects:
+    - SPA soft-404 paths: final URL path contains "notfound", "/404", "not-found"
+      (e.g. Netflix /careers → /NotFound?prev=...)
+    - Off-brand cross-domain redirects: final domain shares no brand with original
+      (e.g. microsoft.com/careers → bing.com)
+
+    Accepts legitimate cross-domain redirects where the brand is preserved
+    (e.g. amazon.com → amazon.jobs).
+    """
+    orig = urlsplit(original_url)
+    final = urlsplit(final_url)
+
+    # 1. Reject soft-404 path indicators (case-insensitive).
+    final_path_lower = final.path.lower()
+    if any(indicator in final_path_lower for indicator in _SOFT_404_INDICATORS):
+        return False
+
+    # 2. If domain changed, verify the brand name survives in the new host.
+    orig_host = re.sub(r"^www\.", "", orig.netloc, flags=re.IGNORECASE).lower()
+    final_host = re.sub(r"^www\.", "", final.netloc, flags=re.IGNORECASE).lower()
+    if orig_host != final_host:
+        brand = orig_host.split(".")[0]   # e.g. "microsoft" from "microsoft.com"
+        if len(brand) > 3 and brand not in final_host:
+            return False
+
+    return True
+
+
+def _base_parts(website: str) -> tuple[str, str, str]:
+    """Return (scheme, host, root_domain) for a website URL.
+
+    root_domain strips a leading 'www.' from the host so that subdomain
+    candidates like 'careers.{root_domain}' are formed correctly.
+    Example: 'https://www.acme.com/about' → ('https', 'www.acme.com', 'acme.com')
+    """
+    parts = urlsplit(website)
+    scheme = parts.scheme or "https"
+    host = parts.netloc or parts.path.split("/")[0]
+    root = re.sub(r"^www\.", "", host, count=1, flags=re.IGNORECASE)
+    return scheme, host, root
+
+
+# ---------------------------------------------------------------------------
+# Tier 2 — URL-pattern probing
+# ---------------------------------------------------------------------------
+
+
+def probe_url_patterns(website: str, client: httpx.Client) -> str | None:
+    """Probe well-known career URL paths and subdomains; return first reachable URL.
+
+    Probes in order:
+      /careers, /career, /jobs, /join-us, /join
+      careers.{root_domain}, jobs.{root_domain}
+    Uses HTTP HEAD with GET fallback via http.probe_url.
+    """
+    scheme, host, root = _base_parts(website)
+    base = f"{scheme}://{host}"
+
+    candidates: list[str] = [
+        f"{base}/careers",
+        f"{base}/career",
+        f"{base}/jobs",
+        f"{base}/join-us",
+        f"{base}/join",
+        f"{scheme}://careers.{root}",
+        f"{scheme}://jobs.{root}",
+    ]
+
+    for candidate in candidates:
+        result = probe_url(client, candidate)
+        if result and _is_plausible_careers_url(candidate, result):
+            logger.info("heuristics.probe_url_patterns(%s): hit url=%s", website, result)
+            return result
+        if result:
+            logger.info(
+                "heuristics.probe_url_patterns(%s): rejected redirect %s → %s",
+                website, candidate, result,
+            )
+
+    logger.info("heuristics.probe_url_patterns(%s): no pattern matched", website)
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Tier 3 — Homepage link scan
+# ---------------------------------------------------------------------------
+
+
+def scan_homepage_links(
+    website: str,
+    client: httpx.Client,
+    *,
+    homepage_html: str | None = None,
+) -> str | None:
+    """Rank <a> anchors on the homepage by career/job keywords; return best match.
+
+    If *homepage_html* is provided it is used directly. Otherwise the homepage
+    is fetched. Returns the highest-scoring anchor href that exceeds the
+    threshold, or None.
+    """
+    html = homepage_html
+    if html is None:
+        try:
+            resp = request_with_retries(client, "GET", website, max_retries=1)
+            if resp.status_code >= 400:
+                logger.warning(
+                    "heuristics.scan_homepage_links(%s): HTTP %s", website, resp.status_code
+                )
+                return None
+            html = resp.text
+        except Exception as exc:
+            logger.warning(
+                "heuristics.scan_homepage_links(%s): fetch error: %s", website, exc
+            )
+            return None
+
+    if not html:
+        return None
+
+    soup = BeautifulSoup(html, "lxml")
+    best_url: str | None = None
+    best_score: float = 0.0
+
+    for tag in soup.find_all("a", href=True):
+        href: str = tag["href"].strip()
+        # Skip non-HTTP links and fragment-only anchors.
+        if not href or href.startswith(("mailto:", "tel:", "#", "javascript:")):
+            continue
+
+        full_url = urljoin(website, href)
+        # Keep only http(s) links.
+        if not full_url.startswith(("http://", "https://")):
+            continue
+
+        score = _score_anchor(href, tag.get_text(separator=" ", strip=True))
+        if score > best_score:
+            best_score = score
+            best_url = full_url
+
+    if best_url and best_score >= _SCORE_THRESHOLD:
+        logger.info(
+            "heuristics.scan_homepage_links(%s): hit url=%s score=%.1f",
+            website, best_url, best_score,
+        )
+        return best_url
+
+    logger.info(
+        "heuristics.scan_homepage_links(%s): no link above threshold (best=%.1f)",
+        website, best_score,
+    )
+    return None
+
+
+def _score_anchor(href: str, text: str) -> float:
+    """Compute a relevance score for an anchor based on its href path and text."""
+    score = 0.0
+    for pattern, weight in _HREF_WEIGHTS:
+        if pattern.search(href):
+            score += weight
+    for pattern, weight in _TEXT_WEIGHTS:
+        if pattern.search(text):
+            score += weight
+    return score
+
+
+# ---------------------------------------------------------------------------
+# Tier 4 — Sitemap
+# ---------------------------------------------------------------------------
+
+# Career/job keyword matcher applied to sitemap <loc> URLs by _scan_locs().
+# The keyword must directly follow a "/", so "/about/careers" matches while
+# "/our-careers" does not. Matching is case-insensitive and unanchored.
+_CAREER_URL_RE = re.compile(
+    r"/(career|job|join|opening|position|opportunit|vacanc|hiring)",
+    re.IGNORECASE,
+)
+
+
+def parse_sitemap(website: str, client: httpx.Client) -> str | None:
+    """Fetch /sitemap.xml and scan <loc> URLs for career/job keywords.
+
+    If the sitemap is an index, fetches up to _MAX_SITEMAP_CHILDREN child
+    sitemaps. Scans up to _MAX_LOC_ENTRIES <loc> entries in total.
+    Returns the first matching URL, or None.
+    """
+    scheme, host, _ = _base_parts(website)
+    sitemap_url = f"{scheme}://{host}/sitemap.xml"
+    try:
+        xml = _fetch_xml(client, sitemap_url)
+    except Exception as exc:
+        logger.info("heuristics.parse_sitemap(%s): fetch error: %s", website, exc)
+        return None
+
+    if xml is None:
+        logger.info("heuristics.parse_sitemap(%s): sitemap not found", website)
+        return None
+
+    result = _scan_sitemap_xml(xml, website, client)
+    if result:
+        logger.info("heuristics.parse_sitemap(%s): hit url=%s", website, result)
+    else:
+        logger.info("heuristics.parse_sitemap(%s): no career URL found", website)
+    return result
+
+
+def _fetch_xml(client: httpx.Client, url: str) -> str | None:
+    """GET a URL and return the text if the response is < 400, else None."""
+    resp = request_with_retries(client, "GET", url, max_retries=1)
+    if resp.status_code >= 400:
+        return None
+    return resp.text
+
+
+def _scan_sitemap_xml(xml: str, website: str, client: httpx.Client) -> str | None:
+    """Parse sitemap XML; handle sitemap index by fetching children."""
+    soup = BeautifulSoup(xml, "xml")
+
+    # Sitemap index: contains <sitemap><loc>…</loc></sitemap>
+    sitemap_tags = soup.find_all("sitemap")
+    if sitemap_tags:
+        child_locs = [
+            tag.find("loc").get_text(strip=True)
+            for tag in sitemap_tags
+            if tag.find("loc")
+        ]
+        for child_url in child_locs[:_MAX_SITEMAP_CHILDREN]:
+            try:
+                child_xml = _fetch_xml(client, child_url)
+            except Exception:
+                continue
+            if child_xml:
+                result = _scan_locs(child_xml)
+                if result:
+                    return result
+        return None
+
+    # Plain sitemap: contains <url><loc>…</loc></url>
+    return _scan_locs(xml)
+
+
+def _scan_locs(xml: str) -> str | None:
+    """Scan <loc> elements in a sitemap for career/job keywords."""
+    soup = BeautifulSoup(xml, "xml")
+    count = 0
+    for loc_tag in soup.find_all("loc"):
+        if count >= _MAX_LOC_ENTRIES:
+            break
+        count += 1
+        loc: str = loc_tag.get_text(strip=True)
+        if _CAREER_URL_RE.search(loc):
+            return loc
+    return None
--- a/jobsource/http.py
+++ b/jobsource/http.py
@@ -18,6 +18,7 @@ from .config import get_settings
 logger = logging.getLogger(__name__)

 _RETRY_STATUS = frozenset({429, 500, 502, 503, 504})
+_HEAD_NOT_SUPPORTED = frozenset({405, 501})


 def default_headers() -> dict[str, str]:
@@ -95,3 +96,21 @@ def request_with_retries(
    if last_exc is not None:  # pragma: no cover - defensive
        raise last_exc
    raise RuntimeError("request_with_retries exhausted without a response")
+
+
+def probe_url(client: httpx.Client, url: str) -> str | None:
+    """Probe a URL with HEAD (fallback GET on 405/501); return final URL or None.
+
+    Returns the str representation of the final URL after redirects when the
+    server responds with a non-error status (<400). Returns None on any
+    network error or error status.
+    """
+    try:
+        resp = request_with_retries(client, "HEAD", url, max_retries=1)
+        if resp.status_code in _HEAD_NOT_SUPPORTED:
+            resp = request_with_retries(client, "GET", url, max_retries=1)
+        if resp.status_code < 400:
+            return str(resp.url)
+        return None
+    except Exception:
+        return None