phase2-ATS + heuristic careers finding

2026-06-17 17:33:11 -04:00
parent cd9ab9b95e
commit 113a4ced36
11 changed files with 2836 additions and 39 deletions
--- a/tests/test_ats.py
+++ b/tests/test_ats.py
@@ -0,0 +1,749 @@
+"""Tests for jobsource/careers/ats.py — all network-free via monkeypatching."""
+from __future__ import annotations
+
+import pytest
+
+from jobsource.careers.ats import (
+    ATSBoard,
+    ATSFetch,
+    ATSResult,
+    _board_from_slug,
+    _domain_stem,
+    _fetch_ashby,
+    _fetch_greenhouse,
+    _fetch_lever,
+    _fetch_workday,
+    _loose_name_match,
+    _slug_candidates,
+    detect_and_fetch,
+    detect_ats_in_html,
+    detect_ats_in_url,
+    recover_via_slug_guess,
+)
+
+
+# ---------------------------------------------------------------------------
+# Tiny fake HTTP response for monkeypatching request_with_retries
+# ---------------------------------------------------------------------------
+
+
+class FakeResponse:
+    """Minimal response double returned by the patched ``request_with_retries``.
+
+    Carries ``status_code``, ``url`` and ``text`` (the ``str()`` of *body*);
+    ``json()`` hands back the *body* object itself, without any parsing.
+    """
+
+    def __init__(self, status_code: int, body: object, url: str = "https://example.com"):
+        self._body = body
+        self.text = str(body)
+        self.url = url
+        self.status_code = status_code
+
+    def json(self) -> object:
+        """Return the canned body exactly as it was passed to the constructor."""
+        payload = self._body
+        return payload
+
+
+class FakeClient:
+    """Stands in for httpx.Client; never actually used in network calls here.
+
+    The class is deliberately empty: the tests in this module replace
+    ``request_with_retries`` via monkeypatching, so an instance is only passed
+    along as an opaque placeholder argument.
+    """
+
+
+# ---------------------------------------------------------------------------
+# detect_ats_in_html — Greenhouse
+# ---------------------------------------------------------------------------
+
+
+class TestDetectATSInHtmlGreenhouse:
+    """Greenhouse detection from raw HTML: embed script, direct link, job-boards host."""
+
+    def test_boards_greenhouse_script_tag(self):
+        # Embed script: the expected slug is the value of the ``for=`` query parameter.
+        markup = '<script src="https://boards.greenhouse.io/embed/job_board?for=airbnb"></script>'
+        found = detect_ats_in_html(markup)
+        assert found is not None
+        assert (found.ats_name, found.slug) == ("greenhouse", "airbnb")
+        assert found.careers_url == "https://boards.greenhouse.io/airbnb"
+
+    def test_boards_greenhouse_direct_link(self):
+        markup = '<a href="https://boards.greenhouse.io/acme">Jobs</a>'
+        found = detect_ats_in_html(markup)
+        assert found is not None
+        assert (found.ats_name, found.slug) == ("greenhouse", "acme")
+
+    def test_job_boards_subdomain(self):
+        # The ``job-boards.`` host variant must be recognised as Greenhouse too.
+        markup = '<a href="https://job-boards.greenhouse.io/stripe">Jobs</a>'
+        found = detect_ats_in_html(markup)
+        assert found is not None
+        assert (found.ats_name, found.slug) == ("greenhouse", "stripe")
+
+    def test_no_match_returns_none(self):
+        outcome = detect_ats_in_html("<html><body>Nothing here</body></html>")
+        assert outcome is None
+
+
+# ---------------------------------------------------------------------------
+# detect_ats_in_html — Lever
+# ---------------------------------------------------------------------------
+
+
+class TestDetectATSInHtmlLever:
+    """Lever detection from raw HTML: anchor links and bare host references in scripts."""
+
+    def test_jobs_lever_link(self):
+        markup = '<a href="https://jobs.lever.co/leverdemo">Open roles</a>'
+        found = detect_ats_in_html(markup)
+        assert found is not None
+        assert (found.ats_name, found.slug) == ("lever", "leverdemo")
+        assert found.careers_url == "https://jobs.lever.co/leverdemo"
+
+    def test_lever_embed_script(self):
+        # No scheme and no anchor tag: the host/slug pair appears inside a JS string.
+        markup = 'var lever = "jobs.lever.co/acme-corp";'
+        found = detect_ats_in_html(markup)
+        assert found is not None
+        assert (found.ats_name, found.slug) == ("lever", "acme-corp")
+
+
+# ---------------------------------------------------------------------------
+# detect_ats_in_html — Ashby
+# ---------------------------------------------------------------------------
+
+
+class TestDetectATSInHtmlAshby:
+    """Ashby detection from raw HTML; the slug is expected exactly as written in the link."""
+
+    def test_jobs_ashbyhq_link(self):
+        # Mixed-case slug: the expected values keep the capital "R".
+        markup = '<a href="https://jobs.ashbyhq.com/Ramp">Careers</a>'
+        found = detect_ats_in_html(markup)
+        assert found is not None
+        assert (found.ats_name, found.slug) == ("ashby", "Ramp")
+        assert found.careers_url == "https://jobs.ashbyhq.com/Ramp"
+
+    def test_lowercase_slug(self):
+        markup = '<a href="https://jobs.ashbyhq.com/linear">Join us</a>'
+        found = detect_ats_in_html(markup)
+        assert found is not None
+        assert found.slug == "linear"
+
+
+# ---------------------------------------------------------------------------
+# detect_ats_in_html — Workday
+# ---------------------------------------------------------------------------
+
+
+class TestDetectATSInHtmlWorkday:
+    """Workday detection from raw HTML, including the host/tenant/site coordinates."""
+
+    def test_myworkdayjobs_link(self):
+        link = "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite"
+        found = detect_ats_in_html(f'<a href="{link}">Jobs</a>')
+        assert found is not None
+        assert found.ats_name == "workday"
+        # Host, tenant and site are each checked separately.
+        assert found.wd_host == "nvidia.wd5.myworkdayjobs.com"
+        assert found.wd_tenant == "nvidia"
+        assert found.wd_site == "NVIDIAExternalCareerSite"
+        # The locale segment must survive in the careers URL.
+        assert "en-US" in found.careers_url
+        assert found.careers_url == "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite"
+
+    def test_workday_without_locale(self):
+        markup = '<a href="https://acme.wd1.myworkdayjobs.com/AcmeCareers">Careers</a>'
+        found = detect_ats_in_html(markup)
+        assert found is not None
+        assert (found.ats_name, found.wd_site) == ("workday", "AcmeCareers")
+
+    def test_workday_missing_site_returns_none(self):
+        # Just the host with no path — can't form a board
+        bare_host = 'https://acme.wd1.myworkdayjobs.com'
+        assert detect_ats_in_html(bare_host) is None
+
+
+# ---------------------------------------------------------------------------
+# detect_ats_in_url
+# ---------------------------------------------------------------------------
+
+
+class TestDetectATSInUrl:
+    """detect_ats_in_url: one recognised URL per supported ATS, plus a non-ATS URL."""
+
+    def test_greenhouse_url(self):
+        found = detect_ats_in_url("https://boards.greenhouse.io/stripe")
+        assert found is not None
+        assert (found.ats_name, found.slug) == ("greenhouse", "stripe")
+
+    def test_lever_url(self):
+        found = detect_ats_in_url("https://jobs.lever.co/leverdemo")
+        assert found is not None
+        assert found.ats_name == "lever"
+
+    def test_ashby_url(self):
+        found = detect_ats_in_url("https://jobs.ashbyhq.com/linear")
+        assert found is not None
+        assert found.ats_name == "ashby"
+
+    def test_workday_url(self):
+        target = "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite"
+        found = detect_ats_in_url(target)
+        assert found is not None
+        assert found.ats_name == "workday"
+
+    def test_non_ats_url_returns_none(self):
+        # A company's own careers page is not an ATS board URL.
+        outcome = detect_ats_in_url("https://www.acme.com/careers")
+        assert outcome is None
+
+
+# ---------------------------------------------------------------------------
+# _fetch_greenhouse
+# ---------------------------------------------------------------------------
+
+
+class TestFetchGreenhouse:
+    """_fetch_greenhouse: payload parsing plus the empty, non-200 and exception paths."""
+
+    def _board(self, slug: str = "airbnb") -> ATSBoard:
+        """Build a Greenhouse board fixture for *slug*."""
+        board_url = f"https://boards.greenhouse.io/{slug}"
+        return ATSBoard(ats_name="greenhouse", slug=slug, careers_url=board_url)
+
+    def test_extracts_absolute_url(self, monkeypatch):
+        payload = {
+            "jobs": [{"absolute_url": "https://careers.airbnb.com/positions/123",
+                      "company_name": "Airbnb"}],
+            "meta": {"total": 42},
+        }
+
+        def fake_request(client, method, url, **kw):
+            return FakeResponse(200, payload)
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        fetched = _fetch_greenhouse(self._board(), FakeClient())
+        assert fetched.first_url == "https://careers.airbnb.com/positions/123"
+        # The count is expected from ``meta.total`` (42), not from the single listed job.
+        assert fetched.job_count == 42
+        assert fetched.org_name == "Airbnb"
+
+    def test_empty_jobs_list(self, monkeypatch):
+        def fake_request(*a, **kw):
+            return FakeResponse(200, {"jobs": [], "meta": {"total": 0}})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        fetched = _fetch_greenhouse(self._board(), FakeClient())
+        assert fetched.first_url is None
+        assert fetched.job_count == 0
+        assert fetched.org_name is None
+
+    def test_non_200_returns_none(self, monkeypatch):
+        def fake_request(*a, **kw):
+            return FakeResponse(404, {})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        fetched = _fetch_greenhouse(self._board(), FakeClient())
+        assert fetched.first_url is None
+        assert fetched.job_count == 0
+
+    def test_network_error_returns_none(self, monkeypatch):
+        # The exception must be absorbed and reported as an empty fetch.
+        def explode(*a, **kw):
+            raise RuntimeError("network down")
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", explode)
+        fetched = _fetch_greenhouse(self._board(), FakeClient())
+        assert fetched.first_url is None
+        assert fetched.job_count == 0
+
+
+# ---------------------------------------------------------------------------
+# _fetch_lever
+# ---------------------------------------------------------------------------
+
+
+class TestFetchLever:
+    """_fetch_lever: list payload parsing plus the empty, malformed and non-200 paths."""
+
+    def _board(self, slug: str = "leverdemo") -> ATSBoard:
+        """Build a Lever board fixture for *slug*."""
+        board_url = f"https://jobs.lever.co/{slug}"
+        return ATSBoard(ats_name="lever", slug=slug, careers_url=board_url)
+
+    def test_extracts_hosted_url(self, monkeypatch):
+        postings = [{"hostedUrl": "https://jobs.lever.co/leverdemo/abc-123"}]
+
+        def fake_request(*a, **kw):
+            return FakeResponse(200, postings)
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        fetched = _fetch_lever(self._board(), FakeClient())
+        assert fetched.first_url == "https://jobs.lever.co/leverdemo/abc-123"
+        assert fetched.job_count == 1
+
+    def test_empty_list(self, monkeypatch):
+        def fake_request(*a, **kw):
+            return FakeResponse(200, [])
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        fetched = _fetch_lever(self._board(), FakeClient())
+        assert fetched.first_url is None
+        assert fetched.job_count == 0
+
+    def test_non_list_response(self, monkeypatch):
+        # A dict body (instead of the expected list) must be treated as "no jobs".
+        def fake_request(*a, **kw):
+            return FakeResponse(200, {"error": "not found"})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        fetched = _fetch_lever(self._board(), FakeClient())
+        assert fetched.first_url is None
+        assert fetched.job_count == 0
+
+    def test_non_200_returns_none(self, monkeypatch):
+        def fake_request(*a, **kw):
+            return FakeResponse(404, [])
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        fetched = _fetch_lever(self._board(), FakeClient())
+        assert fetched.first_url is None
+
+
+# ---------------------------------------------------------------------------
+# _fetch_ashby
+# ---------------------------------------------------------------------------
+
+
+class TestFetchAshby:
+    """_fetch_ashby: payload parsing plus the empty and exception paths."""
+
+    def _board(self, slug: str = "Ramp") -> ATSBoard:
+        """Build an Ashby board fixture for *slug*."""
+        board_url = f"https://jobs.ashbyhq.com/{slug}"
+        return ATSBoard(ats_name="ashby", slug=slug, careers_url=board_url)
+
+    def test_extracts_job_url(self, monkeypatch):
+        payload = {
+            "jobs": [{"jobUrl": "https://jobs.ashbyhq.com/Ramp/abc-def"}],
+            "apiVersion": "1",
+        }
+
+        def fake_request(*a, **kw):
+            return FakeResponse(200, payload)
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        fetched = _fetch_ashby(self._board(), FakeClient())
+        assert fetched.first_url == "https://jobs.ashbyhq.com/Ramp/abc-def"
+        assert fetched.job_count == 1
+
+    def test_empty_jobs(self, monkeypatch):
+        def fake_request(*a, **kw):
+            return FakeResponse(200, {"jobs": []})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        fetched = _fetch_ashby(self._board(), FakeClient())
+        assert fetched.first_url is None
+        assert fetched.job_count == 0
+
+    def test_network_error_returns_none(self, monkeypatch):
+        # The patched request raises as soon as it is called.
+        def explode(*a, **kw):
+            raise RuntimeError("timeout")
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", explode)
+        fetched = _fetch_ashby(self._board(), FakeClient())
+        assert fetched.first_url is None
+
+
+# ---------------------------------------------------------------------------
+# _fetch_workday
+# ---------------------------------------------------------------------------
+
+
+class TestFetchWorkday:
+    """_fetch_workday: job URL assembly plus the empty, incomplete-board and non-200 paths."""
+
+    def _board(self) -> ATSBoard:
+        """Build a fully-populated Workday board fixture (host, tenant and site all set)."""
+        host = "nvidia.wd5.myworkdayjobs.com"
+        site = "NVIDIAExternalCareerSite"
+        return ATSBoard(
+            ats_name="workday",
+            slug="nvidia/NVIDIAExternalCareerSite",
+            careers_url="https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite",
+            wd_host=host,
+            wd_tenant="nvidia",
+            wd_site=site,
+        )
+
+    def test_builds_full_job_url(self, monkeypatch):
+        payload = {
+            "total": 2000,
+            "jobPostings": [{"externalPath": "/job/US/SWE_JR123"}],
+        }
+
+        def fake_request(*a, **kw):
+            return FakeResponse(200, payload)
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        fetched = _fetch_workday(self._board(), FakeClient())
+        # The relative ``externalPath`` is expected appended to the board's careers URL.
+        assert fetched.first_url == "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite/job/US/SWE_JR123"
+        assert fetched.job_count == 2000
+
+    def test_empty_postings(self, monkeypatch):
+        def fake_request(*a, **kw):
+            return FakeResponse(200, {"total": 0, "jobPostings": []})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        fetched = _fetch_workday(self._board(), FakeClient())
+        assert fetched.first_url is None
+        assert fetched.job_count == 0
+
+    def test_missing_wd_coords_returns_none(self):
+        # No wd_host / wd_tenant / wd_site supplied, and nothing is monkeypatched here.
+        incomplete = ATSBoard(
+            ats_name="workday", slug="x", careers_url="https://x.wd1.myworkdayjobs.com"
+        )
+        fetched = _fetch_workday(incomplete, FakeClient())
+        assert fetched.first_url is None
+        assert fetched.job_count == 0
+
+    def test_non_200_returns_none(self, monkeypatch):
+        def fake_request(*a, **kw):
+            return FakeResponse(403, {})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        fetched = _fetch_workday(self._board(), FakeClient())
+        assert fetched.first_url is None
+
+
+# ---------------------------------------------------------------------------
+# detect_and_fetch orchestration
+# ---------------------------------------------------------------------------
+
+
+class TestDetectAndFetch:
+    """detect_and_fetch: detection in the homepage HTML followed by the ATS API fetch."""
+
+    def test_greenhouse_full_flow(self, monkeypatch):
+        markup = '<a href="https://boards.greenhouse.io/airbnb">Jobs</a>'
+        api_payload = {
+            "jobs": [{"absolute_url": "https://careers.airbnb.com/positions/1"}],
+            "meta": {"total": 5},
+        }
+
+        def fake_request(*a, **kw):
+            return FakeResponse(200, api_payload)
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        outcome = detect_and_fetch("https://www.airbnb.com", FakeClient(), homepage_html=markup)
+        assert outcome is not None
+        assert outcome.ats_name == "greenhouse"
+        assert outcome.careers_url == "https://boards.greenhouse.io/airbnb"
+        assert outcome.position_url == "https://careers.airbnb.com/positions/1"
+        assert outcome.job_count == 5
+
+    def test_no_ats_returns_none(self, monkeypatch):
+        markup = "<html><body>No ATS here</body></html>"
+        outcome = detect_and_fetch("https://www.example.com", FakeClient(), homepage_html=markup)
+        assert outcome is None
+
+    def test_api_failure_returns_result_without_position_url(self, monkeypatch):
+        # The board is detected from HTML; the API answering 500 only drops the position URL.
+        markup = '<a href="https://jobs.lever.co/acme">Jobs</a>'
+
+        def fake_request(*a, **kw):
+            return FakeResponse(500, [])
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_request)
+        outcome = detect_and_fetch("https://www.acme.com", FakeClient(), homepage_html=markup)
+        assert outcome is not None
+        assert outcome.ats_name == "lever"
+        assert outcome.careers_url == "https://jobs.lever.co/acme"
+        assert outcome.position_url is None
+
+    def test_homepage_fetch_failure_returns_none(self, monkeypatch):
+        """When homepage_html is None and the fetch fails, return None."""
+        def explode(*a, **kw):
+            raise RuntimeError("connection refused")
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", explode)
+        outcome = detect_and_fetch("https://www.example.com", FakeClient())
+        assert outcome is None
+
+    def test_uses_provided_html_without_fetching(self, monkeypatch):
+        """If homepage_html is provided, request_with_retries is only called for the API."""
+        markup = '<a href="https://jobs.ashbyhq.com/linear">Jobs</a>'
+        api_payload = {"jobs": [{"jobUrl": "https://jobs.ashbyhq.com/linear/xyz"}]}
+        requested: list[str] = []
+
+        def recording_request(client, method, url, **kw):
+            requested.append(url)
+            return FakeResponse(200, api_payload)
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", recording_request)
+        outcome = detect_and_fetch("https://www.linear.app", FakeClient(), homepage_html=markup)
+        assert outcome is not None
+        # Only one call: the API fetch (not the homepage)
+        assert len(requested) == 1
+        assert "ashby" in requested[0]
+
+
+# ---------------------------------------------------------------------------
+# Pure-unit helpers
+# ---------------------------------------------------------------------------
+
+
+class TestDomainStem:
+    """_domain_stem: reduce a site URL to the bare company label of its host."""
+
+    def test_strips_www(self):
+        assert _domain_stem("https://www.anthropic.com") == "anthropic"
+
+    def test_no_www(self):
+        assert _domain_stem("https://linear.app") == "linear"
+
+    def test_with_path(self):
+        assert _domain_stem("https://www.figma.com/careers/") == "figma"
+
+    def test_invalid_returns_none(self):
+        # An empty URL has no host, so no usable stem may come back.  Any falsy
+        # result (None or "") is accepted; a non-empty stem fails the test.
+        # The earlier ``is None or isinstance(..., (str, NoneType))`` form passed
+        # for every str/None return value and therefore verified nothing.
+        assert not _domain_stem("")
+
+
+class TestSlugCandidates:
+    """_slug_candidates: ordering, de-duplication and size bound of the guessed slugs."""
+
+    def test_domain_stem_first(self):
+        candidates = _slug_candidates("https://www.anthropic.com", "Anthropic")
+        assert candidates[0] == "anthropic"
+
+    def test_deduplicates_stem_and_name(self):
+        # stem == normalized name → only one entry
+        candidates = _slug_candidates("https://www.anthropic.com", "Anthropic")
+        assert candidates.count("anthropic") == 1
+
+    def test_different_stem_and_name(self):
+        # The company name carries extra words ("Corp Inc") that the domain
+        # lacks; the domain stem must still be offered, and offered first.
+        # (The former second assertion repeated the same membership check on
+        # both sides of ``or`` and so added nothing.)
+        candidates = _slug_candidates("https://www.acmecorp.com", "Acme Corp Inc")
+        assert "acmecorp" in candidates
+        assert candidates[0] == "acmecorp"
+
+    def test_name_only_candidate_when_stem_equal(self):
+        # When stem and slug match, only one entry
+        candidates = _slug_candidates("https://ramp.com", "Ramp")
+        assert len(candidates) == 1
+        assert candidates[0] == "ramp"
+
+    def test_no_company_name_uses_stem_only(self):
+        candidates = _slug_candidates("https://www.anthropic.com", None)
+        assert candidates == ["anthropic"]
+
+    def test_max_three_candidates(self):
+        # Can't produce more than 3
+        candidates = _slug_candidates("https://www.x.com", "X Corp Inc")
+        assert len(candidates) <= 3
+
+
+class TestLooseNameMatch:
+    """_loose_name_match: lenient comparison of a company name against an ATS org name."""
+
+    def test_exact_match(self):
+        verdict = _loose_name_match("Anthropic", "Anthropic")
+        assert verdict is True
+
+    def test_one_substring_of_other(self):
+        verdict = _loose_name_match("Acme", "Acme Corp Inc")
+        assert verdict is True
+
+    def test_clear_mismatch(self):
+        verdict = _loose_name_match("Acme", "Globex")
+        assert verdict is False
+
+    def test_empty_input_returns_true(self):
+        # An empty first argument is expected to count as a match.
+        verdict = _loose_name_match("", "Acme")
+        assert verdict is True
+
+    def test_empty_org_returns_true(self):
+        # Likewise for an empty second argument.
+        verdict = _loose_name_match("Acme", "")
+        assert verdict is True
+
+    def test_case_insensitive(self):
+        verdict = _loose_name_match("ANTHROPIC", "anthropic")
+        assert verdict is True
+
+
+# ---------------------------------------------------------------------------
+# recover_via_slug_guess
+# ---------------------------------------------------------------------------
+
+
+class TestRecoverViaSlugGuess:
+    """All tests drive the real _fetch_* via a URL-dispatching fake request_with_retries.
+
+    Each test installs a ``fake_req`` that inspects the requested URL and
+    answers with a canned Greenhouse / Lever / Ashby payload (or a 404), so
+    the probing order and acceptance rules of ``recover_via_slug_guess`` are
+    exercised without any network access.
+    """
+
+    # ----- Canned payload builders -----
+
+    def _gh_resp(self, slug: str, company_name: str, count: int = 5) -> dict:
+        """Canned Greenhouse response with one job and ``meta.total == count``."""
+        return {
+            "jobs": [{"absolute_url": f"https://boards.greenhouse.io/{slug}/jobs/1",
+                      "company_name": company_name}],
+            "meta": {"total": count},
+        }
+
+    def _gh_empty(self) -> dict:
+        """Canned Greenhouse response with no jobs."""
+        return {"jobs": [], "meta": {"total": 0}}
+
+    def _lever_resp(self, slug: str, count: int = 3) -> list:
+        """Canned Lever response: a list of ``count`` identical postings."""
+        return [{"hostedUrl": f"https://jobs.lever.co/{slug}/abc"}] * count
+
+    def _ashby_resp(self, slug: str, count: int = 2) -> dict:
+        """Canned Ashby response carrying ``count`` identical jobs."""
+        return {"jobs": [{"jobUrl": f"https://jobs.ashbyhq.com/{slug}/xyz"}] * count}
+
+    def _ashby_empty(self) -> dict:
+        """Canned Ashby response with no jobs."""
+        return {"jobs": []}
+
+    def _lever_empty(self) -> list:
+        """Canned Lever response with no postings."""
+        return []
+
+    # ----- Domain-stem hit (Anthropic-style) -----
+
+    def test_domain_stem_greenhouse_hit(self, monkeypatch):
+        """Greenhouse slug derived from domain stem → returns ATSResult."""
+        def fake_req(client, method, url, **kw):
+            if "boards-api.greenhouse.io/v1/boards/anthropic" in url:
+                return FakeResponse(200, self._gh_resp("anthropic", "Anthropic", count=370))
+            # All other probes empty
+            if "lever.co" in url:
+                return FakeResponse(200, self._lever_empty())
+            if "ashby" in url:
+                return FakeResponse(200, self._ashby_empty())
+            return FakeResponse(404, {})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req)
+        result = recover_via_slug_guess(
+            "https://www.anthropic.com", "Anthropic", FakeClient()
+        )
+        assert result is not None
+        assert result.ats_name == "greenhouse"
+        assert result.careers_url == "https://boards.greenhouse.io/anthropic"
+        assert result.position_url is not None
+        assert result.job_count == 370
+
+    # ----- Name-candidate fallback when stem misses -----
+
+    def test_name_candidate_fallback(self, monkeypatch):
+        """Stem misses; _slug(company_name) slug hits on Lever."""
+        def fake_req(client, method, url, **kw):
+            # NOTE: the "acmecorp" checks must come first, because "acme" is a
+            # substring of "acmecorp" and would otherwise match stem probes too.
+            # Stem slug "acmecorp" → greenhouse empty, lever empty, ashby empty
+            if "acmecorp" in url and "boards-api" in url:
+                return FakeResponse(200, self._gh_empty())
+            if "acmecorp" in url and "lever" in url:
+                return FakeResponse(200, self._lever_empty())
+            if "acmecorp" in url and "ashby" in url:
+                return FakeResponse(200, self._ashby_empty())
+            # Name slug "acme" → lever hit
+            if "acme" in url and "lever" in url:
+                return FakeResponse(200, self._lever_resp("acme"))
+            if "acme" in url and "boards-api" in url:
+                return FakeResponse(200, self._gh_empty())
+            return FakeResponse(404, {})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req)
+        result = recover_via_slug_guess(
+            "https://www.acmecorp.com", "Acme", FakeClient()
+        )
+        assert result is not None
+        assert result.ats_name == "lever"
+        assert "acme" in result.careers_url
+
+    # ----- 0-jobs reject -----
+
+    def test_zero_jobs_rejected(self, monkeypatch):
+        """All slugs resolve but job_count==0 everywhere → None."""
+        def fake_req(client, method, url, **kw):
+            if "boards-api" in url:
+                return FakeResponse(200, self._gh_empty())
+            if "lever" in url:
+                return FakeResponse(200, self._lever_empty())
+            if "ashby" in url:
+                return FakeResponse(200, self._ashby_empty())
+            return FakeResponse(404, {})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req)
+        result = recover_via_slug_guess(
+            "https://www.acme.com", "Acme", FakeClient()
+        )
+        assert result is None
+
+    # ----- Org-name mismatch reject (collision guard) -----
+
+    def test_org_name_mismatch_rejected(self, monkeypatch):
+        """Greenhouse returns jobs but org_name is a different company → skip to lever/ashby → miss."""
+        def fake_req(client, method, url, **kw):
+            if "boards-api" in url:
+                # Returns jobs but for wrong company
+                return FakeResponse(200, self._gh_resp("acme", "Globex Corporation", count=10))
+            if "lever" in url:
+                return FakeResponse(200, self._lever_empty())
+            if "ashby" in url:
+                return FakeResponse(200, self._ashby_empty())
+            return FakeResponse(404, {})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req)
+        result = recover_via_slug_guess(
+            "https://www.acme.com", "Acme Corp", FakeClient()
+        )
+        assert result is None
+
+    # ----- Lever hit (no org_name) accepted on job_count alone -----
+
+    def test_lever_hit_without_org_name_accepted(self, monkeypatch):
+        """Lever doesn't expose org_name → cross-check is skipped; job_count>0 wins."""
+        def fake_req(client, method, url, **kw):
+            if "boards-api" in url:
+                return FakeResponse(200, self._gh_empty())
+            if "lever" in url:
+                return FakeResponse(200, self._lever_resp("acme"))
+            if "ashby" in url:
+                return FakeResponse(200, self._ashby_empty())
+            return FakeResponse(404, {})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req)
+        result = recover_via_slug_guess(
+            "https://www.acme.com", "Completely Different Name", FakeClient()
+        )
+        # Lever returns jobs, no org_name → cross-check skipped → accepted
+        assert result is not None
+        assert result.ats_name == "lever"
+
+    # ----- Short-circuit: greenhouse hit stops remaining probes -----
+
+    def test_short_circuits_on_first_hit(self, monkeypatch):
+        """Once Greenhouse hits, Lever and Ashby are NOT probed."""
+        probed: list[str] = []
+
+        def fake_req(client, method, url, **kw):
+            probed.append(url)
+            if "boards-api.greenhouse.io/v1/boards/acme" in url:
+                return FakeResponse(200, self._gh_resp("acme", "Acme", count=5))
+            if "lever" in url:
+                return FakeResponse(200, self._lever_resp("acme"))
+            if "ashby" in url:
+                return FakeResponse(200, self._ashby_resp("acme"))
+            return FakeResponse(404, {})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req)
+        result = recover_via_slug_guess(
+            "https://www.acme.com", "Acme", FakeClient()
+        )
+        assert result is not None
+        assert result.ats_name == "greenhouse"
+        # Lever and Ashby URLs must not have been probed
+        assert not any("lever" in u for u in probed)
+        assert not any("ashby" in u for u in probed)
+
+    # ----- company_name=None: stem-only, cross-check skipped -----
+
+    def test_no_company_name_uses_stem_and_skips_crosscheck(self, monkeypatch):
+        """With company_name=None, use domain stem only; org_name cross-check skipped."""
+        def fake_req(client, method, url, **kw):
+            if "boards-api.greenhouse.io/v1/boards/acme" in url:
+                return FakeResponse(200, self._gh_resp("acme", "Some Other Company"))
+            if "lever" in url:
+                return FakeResponse(200, self._lever_empty())
+            if "ashby" in url:
+                return FakeResponse(200, self._ashby_empty())
+            return FakeResponse(404, {})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req)
+        # company_name=None → cross-check disabled even if org_name differs
+        result = recover_via_slug_guess(
+            "https://www.acme.com", None, FakeClient()
+        )
+        assert result is not None
+        assert result.ats_name == "greenhouse"
+
+    # ----- All-miss -----
+
+    def test_all_miss_returns_none(self, monkeypatch):
+        """Every probe answers 404 → no board can be recovered."""
+        monkeypatch.setattr(
+            "jobsource.careers.ats.request_with_retries",
+            lambda *a, **kw: FakeResponse(404, {}),
+        )
+        result = recover_via_slug_guess(
+            "https://www.nobody.com", "Nobody Inc", FakeClient()
+        )
+        assert result is None
+
+    # ----- Network error on one probe falls through -----
+
+    def test_single_probe_error_falls_through(self, monkeypatch):
+        """A probe that raises should not abort recovery; others are still tried."""
+        probed: list[str] = []
+
+        def fake_req(client, method, url, **kw):
+            probed.append(url)
+            if "boards-api" in url:
+                raise RuntimeError("greenhouse down")
+            if "lever" in url:
+                return FakeResponse(200, self._lever_resp("acme"))
+            if "ashby" in url:
+                return FakeResponse(200, self._ashby_empty())
+            return FakeResponse(404, {})
+
+        monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req)
+        result = recover_via_slug_guess(
+            "https://www.acme.com", None, FakeClient()
+        )
+        assert result is not None
+        assert result.ats_name == "lever"
+        # The failing Greenhouse probe really was attempted, and recovery
+        # still moved on to Lever afterwards.
+        assert any("boards-api" in u for u in probed)
+        assert any("lever" in u for u in probed)
--- a/tests/test_cascade.py
+++ b/tests/test_cascade.py
@@ -0,0 +1,552 @@
+"""Tests for jobsource/careers/cascade.py — all network-free via monkeypatching."""
+from __future__ import annotations
+
+import pytest
+
+from jobsource.careers import CareersResult, find_careers_page
+from jobsource.careers.cascade import _detect_ats_in_page, _finalize, _safe_get_html
+from jobsource.careers.ats import ATSBoard, ATSFetch, ATSResult
+
+
+# ---------------------------------------------------------------------------
+# Fake helpers
+# ---------------------------------------------------------------------------
+
+
+class FakeResponse:
+    """Minimal response double exposing ``status_code``, ``text`` and ``url``."""
+
+    def __init__(self, status_code: int, text: str = "", url: str = "https://example.com"):
+        self.url = url
+        self.text = text
+        self.status_code = status_code
+
+
+class FakeClient:
+    """Empty placeholder passed wherever a ``client`` argument is required."""
+
+
+# ---------------------------------------------------------------------------
+# _detect_ats_in_page
+# ---------------------------------------------------------------------------
+
+
+class TestDetectATSInPage:
+    """Exercise ``_detect_ats_in_page`` with ``request_with_retries`` stubbed out."""
+
+    _REQUEST = "jobsource.careers.cascade.request_with_retries"
+
+    def test_returns_board_from_page_html(self, monkeypatch):
+        """A Greenhouse link in the fetched page yields the matching board."""
+        page = '<a href="https://boards.greenhouse.io/vercel">Jobs</a>'
+
+        def serve_page(*args, **kwargs):
+            return FakeResponse(200, page)
+
+        monkeypatch.setattr(self._REQUEST, serve_page)
+        found = _detect_ats_in_page("https://vercel.com/careers", FakeClient())
+        assert found is not None
+        assert found.ats_name == "greenhouse"
+        assert found.slug == "vercel"
+
+    def test_returns_none_on_404(self, monkeypatch):
+        """A 404 response produces no board."""
+
+        def serve_404(*args, **kwargs):
+            return FakeResponse(404, "")
+
+        monkeypatch.setattr(self._REQUEST, serve_404)
+        found = _detect_ats_in_page("https://vercel.com/careers", FakeClient())
+        assert found is None
+
+    def test_returns_none_on_exception(self, monkeypatch):
+        """An exception from the request layer is reported as no board."""
+
+        def explode(*args, **kwargs):
+            raise RuntimeError("timeout")
+
+        monkeypatch.setattr(self._REQUEST, explode)
+        found = _detect_ats_in_page("https://vercel.com/careers", FakeClient())
+        assert found is None
+
+    def test_returns_none_when_no_ats_in_page(self, monkeypatch):
+        """A 200 page without any ATS marker produces no board."""
+
+        def serve_plain(*args, **kwargs):
+            return FakeResponse(200, "<html>no ats here</html>")
+
+        monkeypatch.setattr(self._REQUEST, serve_plain)
+        found = _detect_ats_in_page("https://acme.com/careers", FakeClient())
+        assert found is None
+
+
+# ---------------------------------------------------------------------------
+# _safe_get_html
+# ---------------------------------------------------------------------------
+
+
+class TestSafeGetHtml:
+    """Exercise ``_safe_get_html`` with ``request_with_retries`` stubbed out."""
+
+    _REQUEST = "jobsource.careers.cascade.request_with_retries"
+
+    def test_returns_text_on_200(self, monkeypatch):
+        """The response text is returned for a 200."""
+
+        def serve_ok(*args, **kwargs):
+            return FakeResponse(200, "page html")
+
+        monkeypatch.setattr(self._REQUEST, serve_ok)
+        body = _safe_get_html("https://acme.com", FakeClient())
+        assert body == "page html"
+
+    def test_returns_none_on_404(self, monkeypatch):
+        """A 404 response is reported as ``None``."""
+
+        def serve_404(*args, **kwargs):
+            return FakeResponse(404, "")
+
+        monkeypatch.setattr(self._REQUEST, serve_404)
+        body = _safe_get_html("https://acme.com", FakeClient())
+        assert body is None
+
+    def test_returns_none_on_exception(self, monkeypatch):
+        """An exception from the request layer is reported as ``None``."""
+
+        def explode(*args, **kwargs):
+            raise RuntimeError("network error")
+
+        monkeypatch.setattr(self._REQUEST, explode)
+        body = _safe_get_html("https://acme.com", FakeClient())
+        assert body is None
+
+
+# ---------------------------------------------------------------------------
+# find_careers_page — tier ordering and early return
+# ---------------------------------------------------------------------------
+
+
+class TestCascadeTierOrdering:
+    """Tier ordering, per-tier confidence/method and early return of ``find_careers_page``."""
+
+    _CASCADE = "jobsource.careers.cascade"
+
+    def _patch_tiers(self, monkeypatch, *, ats=None, slug_guess=None,
+                     url_pattern=None, homepage=None, sitemap=None, html="<html/>"):
+        """Stub every cascade tier so each returns the supplied canned value.
+
+        Args:
+            monkeypatch: pytest's ``monkeypatch`` fixture.
+            ats: return value of ``_ats.detect_and_fetch``.
+            slug_guess: return value of ``_ats.recover_via_slug_guess``.
+            url_pattern: return value of ``_heuristics.probe_url_patterns``.
+            homepage: return value of ``_heuristics.scan_homepage_links``.
+            sitemap: return value of ``_heuristics.parse_sitemap``.
+            html: return value of ``_safe_get_html``.
+
+        A value of ``None`` makes the corresponding tier miss.
+        """
+        stubs = {
+            "_safe_get_html": lambda website, client: html,
+            "_ats.detect_and_fetch": lambda *a, **kw: ats,
+            "_ats.recover_via_slug_guess": lambda *a, **kw: slug_guess,
+            "_heuristics.probe_url_patterns": lambda *a, **kw: url_pattern,
+            "_heuristics.scan_homepage_links": lambda *a, **kw: homepage,
+            "_heuristics.parse_sitemap": lambda *a, **kw: sitemap,
+            # _finalize looks for an ATS in the URL string and, failing that,
+            # in the fetched page. Stub both so a heuristic hit never reaches
+            # the real request layer with a FakeClient.
+            "_ats.detect_ats_in_url": lambda url: None,
+            "_detect_ats_in_page": lambda *a, **kw: None,
+        }
+        for attr, stub in stubs.items():
+            monkeypatch.setattr(f"{self._CASCADE}.{attr}", stub)
+
+    def test_ats_hit_returns_095_confidence(self, monkeypatch):
+        """An HTML-ATS hit is reported with confidence 0.95 and method ``ats:<name>``."""
+        ats_result = ATSResult(
+            ats_name="greenhouse",
+            careers_url="https://boards.greenhouse.io/acme",
+            position_url="https://careers.acme.com/positions/1",
+            job_count=10,
+        )
+        self._patch_tiers(monkeypatch, ats=ats_result)
+        result = find_careers_page("https://www.acme.com", client=FakeClient())
+        assert result.careers_url == "https://boards.greenhouse.io/acme"
+        assert result.confidence == 0.95
+        assert result.method == "ats:greenhouse"
+        assert result.ats_name == "greenhouse"
+        assert result.position_url == "https://careers.acme.com/positions/1"
+
+    def test_url_pattern_hit_when_ats_misses(self, monkeypatch):
+        """With both ATS tiers missing, a URL-pattern hit scores 0.80."""
+        self._patch_tiers(monkeypatch, url_pattern="https://acme.com/careers")
+        result = find_careers_page("https://www.acme.com", client=FakeClient())
+        assert result.careers_url == "https://acme.com/careers"
+        assert result.confidence == 0.80
+        assert result.method == "url_pattern"
+        assert result.ats_name is None
+
+    def test_homepage_scan_hit_when_ats_and_url_pattern_miss(self, monkeypatch):
+        """A homepage-scan hit scores 0.60."""
+        self._patch_tiers(monkeypatch, homepage="https://acme.com/careers")
+        result = find_careers_page("https://www.acme.com", client=FakeClient())
+        assert result.careers_url == "https://acme.com/careers"
+        assert result.confidence == 0.60
+        assert result.method == "homepage_scan"
+
+    def test_sitemap_hit_when_all_else_misses(self, monkeypatch):
+        """A sitemap hit scores 0.50."""
+        self._patch_tiers(monkeypatch, sitemap="https://acme.com/careers")
+        result = find_careers_page("https://www.acme.com", client=FakeClient())
+        assert result.careers_url == "https://acme.com/careers"
+        assert result.confidence == 0.50
+        assert result.method == "sitemap"
+
+    def test_all_miss_returns_none_method(self, monkeypatch):
+        """When every tier misses the result carries no URL and method ``none``."""
+        self._patch_tiers(monkeypatch)
+        result = find_careers_page("https://www.acme.com", client=FakeClient())
+        assert result.careers_url is None
+        assert result.confidence == 0.0
+        assert result.method == "none"
+
+    def test_ats_hit_skips_later_tiers(self, monkeypatch):
+        """When ATS resolves, slug_guess/url_pattern/homepage/sitemap should not be called."""
+        ats_result = ATSResult(
+            ats_name="lever", careers_url="https://jobs.lever.co/acme",
+        )
+        later_called: list[str] = []
+
+        def make_tracker(name):
+            def fn(*a, **kw):
+                later_called.append(name)
+                return None
+            return fn
+
+        self._patch_tiers(monkeypatch, ats=ats_result)
+        # Replace the later tiers with trackers that record any invocation.
+        for tier, attr in (
+            ("slug_guess", "_ats.recover_via_slug_guess"),
+            ("url_pattern", "_heuristics.probe_url_patterns"),
+            ("homepage_scan", "_heuristics.scan_homepage_links"),
+            ("sitemap", "_heuristics.parse_sitemap"),
+        ):
+            monkeypatch.setattr(f"{self._CASCADE}.{attr}", make_tracker(tier))
+
+        find_careers_page("https://www.acme.com", client=FakeClient())
+        assert later_called == []
+
+    def test_failing_tier_falls_through(self, monkeypatch):
+        """A tier that raises should not abort the cascade."""
+        self._patch_tiers(monkeypatch, url_pattern="https://acme.com/careers")
+
+        def exploding_ats(*a, **kw):
+            raise RuntimeError("ats exploded")
+
+        monkeypatch.setattr(f"{self._CASCADE}._ats.detect_and_fetch", exploding_ats)
+
+        result = find_careers_page("https://www.acme.com", client=FakeClient())
+        assert result.method == "url_pattern"
+        assert result.careers_url == "https://acme.com/careers"
+
+
+# ---------------------------------------------------------------------------
+# ATS-URL upgrade in _finalize
+# ---------------------------------------------------------------------------
+
+
+class TestFinalizeATSUpgrade:
+    """``_finalize`` upgrades a heuristic hit to an ATS result when a board is detected."""
+
+    def test_lever_url_upgrades_to_ats_lever(self, monkeypatch):
+        """When a heuristic finds a Lever URL, _finalize upgrades to ats:lever."""
+        lever_board = ATSBoard(
+            ats_name="lever",
+            slug="acme",
+            careers_url="https://jobs.lever.co/acme",
+        )
+        monkeypatch.setattr(
+            "jobsource.careers.cascade._ats.detect_ats_in_url",
+            lambda url: lever_board,
+        )
+        monkeypatch.setattr(
+            "jobsource.careers.cascade._ats._FETCH_DISPATCH",
+            {"lever": lambda board, client: ATSFetch(first_url="https://jobs.lever.co/acme/abc", job_count=5)},
+        )
+
+        result = _finalize(
+            "https://jobs.lever.co/acme", "homepage_scan", 0.60,
+            "https://www.acme.com", FakeClient(),
+        )
+        assert result.method == "ats:lever"
+        assert result.confidence == 0.95
+        assert result.position_url == "https://jobs.lever.co/acme/abc"
+        assert result.ats_name == "lever"
+
+    def test_non_ats_url_no_upgrade(self, monkeypatch):
+        """Without any ATS signal the heuristic method and confidence pass through."""
+        monkeypatch.setattr(
+            "jobsource.careers.cascade._ats.detect_ats_in_url",
+            lambda url: None,
+        )
+        # Keep the page-level ATS probe off the real request layer: with only
+        # the URL check stubbed, _finalize would go on to fetch the page.
+        monkeypatch.setattr(
+            "jobsource.careers.cascade._detect_ats_in_page",
+            lambda url, client: None,
+        )
+        result = _finalize(
+            "https://acme.com/careers", "url_pattern", 0.80,
+            "https://www.acme.com", FakeClient(),
+        )
+        assert result.method == "url_pattern"
+        assert result.confidence == 0.80
+        assert result.position_url is None
+
+    def test_ats_upgrade_from_page_html(self, monkeypatch):
+        """URL pattern finds /careers; fetching that page reveals Greenhouse embed → upgrade."""
+        gh_board = ATSBoard(
+            ats_name="greenhouse",
+            slug="vercel",
+            careers_url="https://boards.greenhouse.io/vercel",
+        )
+        monkeypatch.setattr(
+            "jobsource.careers.cascade._ats.detect_ats_in_url",
+            lambda url: None,  # URL string itself is not an ATS URL
+        )
+        monkeypatch.setattr(
+            "jobsource.careers.cascade._detect_ats_in_page",
+            lambda url, client: gh_board,  # page HTML reveals Greenhouse
+        )
+        monkeypatch.setattr(
+            "jobsource.careers.cascade._ats._FETCH_DISPATCH",
+            {"greenhouse": lambda board, client: ATSFetch(first_url="https://job-boards.greenhouse.io/vercel/jobs/123", job_count=73)},
+        )
+        result = _finalize(
+            "https://vercel.com/careers", "url_pattern", 0.80,
+            "https://vercel.com", FakeClient(),
+        )
+        assert result.method == "ats:greenhouse"
+        assert result.confidence == 0.95
+        assert result.careers_url == "https://boards.greenhouse.io/vercel"
+        assert result.position_url == "https://job-boards.greenhouse.io/vercel/jobs/123"
+        assert result.ats_name == "greenhouse"
+
+    def test_ats_upgrade_fetch_failure_falls_back_to_original(self, monkeypatch):
+        """If ATS fetch during upgrade fails, return the original heuristic result."""
+        gh_board = ATSBoard(
+            ats_name="greenhouse",
+            slug="acme",
+            careers_url="https://boards.greenhouse.io/acme",
+        )
+        monkeypatch.setattr(
+            "jobsource.careers.cascade._ats.detect_ats_in_url",
+            lambda url: gh_board,
+        )
+
+        def boom(board, client):
+            raise RuntimeError("api down")
+
+        monkeypatch.setattr(
+            "jobsource.careers.cascade._ats._FETCH_DISPATCH",
+            {"greenhouse": boom},
+        )
+        result = _finalize(
+            "https://boards.greenhouse.io/acme", "sitemap", 0.50,
+            "https://www.acme.com", FakeClient(),
+        )
+        # Upgrade failed — returns the original heuristic result
+        assert result.method == "sitemap"
+        assert result.confidence == 0.50
+
+
+# ---------------------------------------------------------------------------
+# CareersResult model
+# ---------------------------------------------------------------------------
+
+
+class TestCareersResultModel:
+    """Field defaults and explicit construction of ``CareersResult``."""
+
+    def test_defaults(self):
+        """A bare ``CareersResult()`` represents 'nothing found'."""
+        empty = CareersResult()
+        assert empty.careers_url is None
+        assert empty.confidence == 0.0
+        assert empty.method == "none"
+        assert empty.ats_name is None
+        assert empty.position_url is None
+
+    def test_full(self):
+        """Explicitly supplied fields are readable back from the model."""
+        fields = {
+            "careers_url": "https://boards.greenhouse.io/acme",
+            "confidence": 0.95,
+            "method": "ats:greenhouse",
+            "ats_name": "greenhouse",
+            "position_url": "https://careers.acme.com/positions/1",
+        }
+        populated = CareersResult(**fields)
+        assert populated.careers_url == "https://boards.greenhouse.io/acme"
+        assert populated.ats_name == "greenhouse"
+
+
+# ---------------------------------------------------------------------------
+# Tier 1b — slug-guess in cascade ordering
+# ---------------------------------------------------------------------------
+
+
+class TestSlugGuessTier:
+    """Tests that slug-guess wires correctly into the cascade between HTML-ATS and url_pattern."""
+
+    _CASCADE = "jobsource.careers.cascade"
+
+    def _patch_for_slug_guess(self, monkeypatch, *, slug_guess_result=None,
+                              url_pattern=None):
+        """Patch cascade so HTML-ATS always misses; control slug_guess and url_pattern.
+
+        Args:
+            monkeypatch: pytest's ``monkeypatch`` fixture.
+            slug_guess_result: return value of ``_ats.recover_via_slug_guess``.
+            url_pattern: return value of ``_heuristics.probe_url_patterns``.
+
+        The homepage-scan and sitemap tiers are stubbed to miss.
+        """
+        stubs = {
+            "_safe_get_html": lambda website, client: "<html/>",
+            "_ats.detect_and_fetch": lambda *a, **kw: None,
+            "_ats.recover_via_slug_guess": lambda *a, **kw: slug_guess_result,
+            "_heuristics.probe_url_patterns": lambda *a, **kw: url_pattern,
+            "_heuristics.scan_homepage_links": lambda *a, **kw: None,
+            "_heuristics.parse_sitemap": lambda *a, **kw: None,
+            # Both ATS-upgrade probes are disabled so a url_pattern hit is
+            # returned as-is and never touches the real request layer.
+            "_ats.detect_ats_in_url": lambda url: None,
+            "_detect_ats_in_page": lambda *a, **kw: None,
+        }
+        for attr, stub in stubs.items():
+            monkeypatch.setattr(f"{self._CASCADE}.{attr}", stub)
+
+    def test_slug_guess_hit_returns_090_confidence(self, monkeypatch):
+        """A slug-guess hit should produce confidence=0.90 and method=ats:{name}:slug_guess."""
+        slug_result = ATSResult(
+            ats_name="greenhouse",
+            careers_url="https://boards.greenhouse.io/anthropic",
+            position_url="https://boards.greenhouse.io/anthropic/jobs/1",
+            job_count=370,
+        )
+        self._patch_for_slug_guess(monkeypatch, slug_guess_result=slug_result)
+        result = find_careers_page(
+            "https://www.anthropic.com",
+            company_name="Anthropic",
+            client=FakeClient(),
+        )
+        assert result.confidence == 0.90
+        assert result.method == "ats:greenhouse:slug_guess"
+        assert result.careers_url == "https://boards.greenhouse.io/anthropic"
+        assert result.ats_name == "greenhouse"
+        assert result.position_url == "https://boards.greenhouse.io/anthropic/jobs/1"
+
+    def test_slug_guess_hit_blocks_url_pattern(self, monkeypatch):
+        """If slug-guess hits, url_pattern should not be called."""
+        url_pattern_called: list[bool] = []
+        slug_result = ATSResult(
+            ats_name="lever",
+            careers_url="https://jobs.lever.co/acme",
+            position_url="https://jobs.lever.co/acme/xyz",
+            job_count=5,
+        )
+        self._patch_for_slug_guess(monkeypatch, slug_guess_result=slug_result)
+
+        def tracking_probe(*a, **kw):
+            url_pattern_called.append(True)
+            return None
+
+        # Override url_pattern with a tracker
+        monkeypatch.setattr(
+            f"{self._CASCADE}._heuristics.probe_url_patterns", tracking_probe
+        )
+        find_careers_page("https://www.acme.com", client=FakeClient())
+        assert url_pattern_called == []
+
+    def test_slug_guess_miss_falls_through_to_url_pattern(self, monkeypatch):
+        """When slug-guess misses, the cascade continues to url_pattern."""
+        self._patch_for_slug_guess(
+            monkeypatch,
+            slug_guess_result=None,
+            url_pattern="https://acme.com/careers",
+        )
+        result = find_careers_page("https://www.acme.com", client=FakeClient())
+        assert result.method == "url_pattern"
+        assert result.confidence == 0.80
+
+    def test_company_name_forwarded_to_slug_guess(self, monkeypatch):
+        """company_name must be passed through to recover_via_slug_guess."""
+        received: list[tuple] = []
+
+        def fake_recover(website, company_name, client):
+            received.append((website, company_name))
+            return None
+
+        self._patch_for_slug_guess(monkeypatch)
+        monkeypatch.setattr(
+            f"{self._CASCADE}._ats.recover_via_slug_guess", fake_recover
+        )
+
+        find_careers_page(
+            "https://www.anthropic.com",
+            company_name="Anthropic",
+            client=FakeClient(),
+        )
+        assert len(received) == 1
+        assert received[0] == ("https://www.anthropic.com", "Anthropic")
+
+    def test_slug_guess_tier_error_falls_through(self, monkeypatch):
+        """A slug-guess tier that raises should not abort the cascade."""
+        self._patch_for_slug_guess(monkeypatch, url_pattern="https://acme.com/careers")
+
+        def exploding_slug_guess(*a, **kw):
+            raise RuntimeError("slug_guess exploded")
+
+        monkeypatch.setattr(
+            f"{self._CASCADE}._ats.recover_via_slug_guess", exploding_slug_guess
+        )
+        result = find_careers_page("https://www.acme.com", client=FakeClient())
+        assert result.method == "url_pattern"
+        assert result.careers_url == "https://acme.com/careers"
--- a/tests/test_heuristics.py
+++ b/tests/test_heuristics.py
@@ -0,0 +1,425 @@
+"""Tests for jobsource/careers/heuristics.py — all network-free."""
+from __future__ import annotations
+
+import pytest
+
+from jobsource.careers.heuristics import (
+    _base_parts,
+    _is_plausible_careers_url,
+    _score_anchor,
+    parse_sitemap,
+    probe_url_patterns,
+    scan_homepage_links,
+)
+
+
+# ---------------------------------------------------------------------------
+# Tiny fake response helper
+# ---------------------------------------------------------------------------
+
+
+class FakeResponse:
+    """Minimal response double: ``status_code``, ``text``, ``url`` and an empty ``json()``."""
+
+    def __init__(self, status_code: int, text: str = "", url: str = "https://example.com"):
+        self.url = url
+        self.text = text
+        self.status_code = status_code
+
+    def json(self) -> object:
+        """Return an empty JSON object regardless of ``text``."""
+        payload: dict = {}
+        return payload
+
+
+class FakeClient:
+    """Empty placeholder passed wherever a ``client`` argument is required."""
+
+
+# ---------------------------------------------------------------------------
+# _is_plausible_careers_url
+# ---------------------------------------------------------------------------
+
+
+class TestIsPlausibleCareersUrl:
+    """Accept/reject decisions of ``_is_plausible_careers_url(probed, final)``."""
+
+    def test_same_domain_clean_path(self):
+        """Probed URL and final URL are identical."""
+        url = "https://netflix.com/careers"
+        assert _is_plausible_careers_url(url, url) is True
+
+    def test_same_domain_deeper_path(self):
+        """Google /careers → /about/careers/applications/."""
+        probed = "https://google.com/careers"
+        landed = "https://google.com/about/careers/applications/"
+        assert _is_plausible_careers_url(probed, landed) is True
+
+    def test_soft_404_notfound_rejected(self):
+        """Netflix SPA: /careers → /NotFound?prev=..."""
+        probed = "https://www.netflix.com/careers"
+        landed = "https://www.netflix.com/NotFound?prev=https%3A%2F%2Fwww.netflix.com%2Fcareers"
+        assert _is_plausible_careers_url(probed, landed) is False
+
+    def test_soft_404_slash404_rejected(self):
+        """A redirect to /404 is rejected."""
+        probed = "https://example.com/jobs"
+        landed = "https://example.com/404"
+        assert _is_plausible_careers_url(probed, landed) is False
+
+    def test_soft_404_not_found_hyphen_rejected(self):
+        """A redirect to /not-found is rejected."""
+        probed = "https://example.com/jobs"
+        landed = "https://example.com/not-found"
+        assert _is_plausible_careers_url(probed, landed) is False
+
+    def test_off_brand_cross_domain_rejected(self):
+        """Microsoft /careers → bing.com."""
+        probed = "https://www.microsoft.com/careers"
+        landed = "https://www.bing.com?ref=aka&shorturl=abc"
+        assert _is_plausible_careers_url(probed, landed) is False
+
+    def test_on_brand_cross_domain_accepted(self):
+        """Amazon /careers → amazon.jobs."""
+        probed = "https://www.amazon.com/careers"
+        landed = "https://amazon.jobs/en/"
+        assert _is_plausible_careers_url(probed, landed) is True
+
+    def test_workday_subdomain_accepted(self):
+        """nvidia.com/careers → nvidia.wd5.myworkdayjobs.com/..."""
+        probed = "https://www.nvidia.com/careers"
+        landed = "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite"
+        assert _is_plausible_careers_url(probed, landed) is True
+
+    def test_short_brand_not_filtered(self):
+        """brand <= 3 chars: skip the off-brand check."""
+        probed = "https://ibm.com/careers"
+        landed = "https://jobs.example.com/"
+        # "ibm" len=3 → no cross-domain filter applied
+        assert _is_plausible_careers_url(probed, landed) is True
+
+
+# ---------------------------------------------------------------------------
+# _base_parts
+# ---------------------------------------------------------------------------
+
+
+class TestBaseParts:
+    """``_base_parts`` splits a URL into ``(scheme, host, root)``."""
+
+    def test_strips_www(self):
+        """``root`` drops a leading ``www.`` while ``host`` keeps it."""
+        parsed_scheme, full_host, bare_root = _base_parts("https://www.acme.com/about")
+        assert (parsed_scheme, full_host, bare_root) == ("https", "www.acme.com", "acme.com")
+
+    def test_no_www(self):
+        """A host without ``www.`` is returned unchanged as ``root``."""
+        _scheme, _host, bare_root = _base_parts("https://acme.com")
+        assert bare_root == "acme.com"
+
+    def test_subdomain_preserved_in_host(self):
+        """Only ``www.`` is stripped; other subdomains stay in both ``host`` and ``root``."""
+        _scheme, full_host, bare_root = _base_parts("https://careers.acme.com/jobs")
+        assert full_host == "careers.acme.com"
+        # www. stripping only
+        assert bare_root == "careers.acme.com"
+
+    def test_http_scheme(self):
+        """The scheme is reported as given."""
+        parsed_scheme, _host, _root = _base_parts("http://acme.com")
+        assert parsed_scheme == "http"
+
+
+# ---------------------------------------------------------------------------
+# _score_anchor
+# ---------------------------------------------------------------------------
+
+
+class TestScoreAnchor:
+    """Relative scoring of ``(href, text)`` pairs by ``_score_anchor``."""
+
+    def test_careers_href_high_score(self):
+        """A bare /careers href scores above 2.0 even with no link text."""
+        score = _score_anchor("/careers", "")
+        assert score > 2.0
+
+    def test_jobs_href_score(self):
+        """A bare /jobs href scores above zero."""
+        score = _score_anchor("/jobs", "")
+        assert score > 0
+
+    def test_text_careers_adds_score(self):
+        """'Careers' link text outscores unrelated text on the same href."""
+        boosted = _score_anchor("/x", "Careers")
+        plain = _score_anchor("/x", "About")
+        assert boosted > plain
+
+    def test_unrelated_href_and_text_zero(self):
+        """An unrelated href with unrelated text scores exactly zero."""
+        score = _score_anchor("/about", "About us")
+        assert score == 0.0
+
+    def test_combined_href_and_text(self):
+        """Matching href plus matching text beats the matching href alone."""
+        both = _score_anchor("/careers", "Careers")
+        href_alone = _score_anchor("/careers", "")
+        assert both > href_alone
+
+
+# ---------------------------------------------------------------------------
+# probe_url_patterns
+# ---------------------------------------------------------------------------
+
+
+class TestProbeUrlPatterns:
+    """``probe_url_patterns`` with ``heuristics.probe_url`` replaced by fakes."""
+
+    _PROBE = "jobsource.careers.heuristics.probe_url"
+
+    def test_returns_careers_path_on_hit(self, monkeypatch):
+        """A candidate ending in /careers that resolves is returned."""
+
+        def fake_probe(client, url):
+            return "https://acme.com/careers" if url.endswith("/careers") else None
+
+        monkeypatch.setattr(self._PROBE, fake_probe)
+        found = probe_url_patterns("https://acme.com", FakeClient())
+        assert found == "https://acme.com/careers"
+
+    def test_returns_none_when_all_miss(self, monkeypatch):
+        """No candidate resolving means no result."""
+
+        def never_hits(c, u):
+            return None
+
+        monkeypatch.setattr(self._PROBE, never_hits)
+        found = probe_url_patterns("https://acme.com", FakeClient())
+        assert found is None
+
+    def test_returns_first_hit_not_second(self, monkeypatch):
+        """Only /career resolves; a /careers candidate must have been probed as well."""
+        seen = []
+
+        def fake_probe(client, url):
+            seen.append(url)
+            return "https://acme.com/career" if url.endswith("/career") else None
+
+        monkeypatch.setattr(self._PROBE, fake_probe)
+        found = probe_url_patterns("https://acme.com", FakeClient())
+        assert found == "https://acme.com/career"
+        # /careers was probed first and missed; /career was the first hit
+        assert any("/careers" in candidate for candidate in seen)
+
+    def test_careers_subdomain_candidate_included(self, monkeypatch):
+        """careers.<root> and jobs.<root> are among the probed candidates."""
+        seen: list[str] = []
+
+        def fake_probe(client, url):
+            seen.append(url)
+            return None
+
+        monkeypatch.setattr(self._PROBE, fake_probe)
+        probe_url_patterns("https://www.acme.com", FakeClient())
+        assert any("careers.acme.com" in candidate for candidate in seen)
+        assert any("jobs.acme.com" in candidate for candidate in seen)
+
+    def test_jobs_subdomain_hit(self, monkeypatch):
+        """A hit on the jobs.<root> candidate is returned."""
+
+        def fake_probe(client, url):
+            return "https://jobs.acme.com" if "jobs.acme.com" in url else None
+
+        monkeypatch.setattr(self._PROBE, fake_probe)
+        found = probe_url_patterns("https://www.acme.com", FakeClient())
+        assert found == "https://jobs.acme.com"
+
+    def test_soft_404_rejected_falls_through_to_none(self, monkeypatch):
+        """All candidates redirect to a NotFound page — should return None."""
+
+        def to_not_found(c, u):
+            # Same substitutions, in the same order, as a chained .replace().
+            for fragment in ("/careers", "/career", "/jobs", "/join-us", "/join"):
+                u = u.replace(fragment, "/NotFound")
+            return u
+
+        monkeypatch.setattr(self._PROBE, to_not_found)
+        found = probe_url_patterns("https://www.netflix.com", FakeClient())
+        assert found is None
+
+    def test_off_brand_redirect_rejected(self, monkeypatch):
+        """/careers redirects to an off-brand domain → skip; later candidate hits."""
+        canned = (
+            ("/careers", "https://www.bing.com?ref=aka"),
+            ("/jobs", "https://acme.com/jobs"),
+        )
+
+        def fake_probe(client, url):
+            for suffix, landed in canned:
+                if url.endswith(suffix):
+                    return landed
+            return None
+
+        monkeypatch.setattr(self._PROBE, fake_probe)
+        found = probe_url_patterns("https://www.acme.com", FakeClient())
+        assert found == "https://acme.com/jobs"
+
+
+# ---------------------------------------------------------------------------
+# scan_homepage_links
+# ---------------------------------------------------------------------------
+
+
+# Homepage fixture: nav with About, Careers ("/careers") and Blog links.
+HOMEPAGE_WITH_CAREERS = """
+<html><body>
+<nav>
+  <a href="/about">About</a>
+  <a href="/careers">Careers</a>
+  <a href="/blog">Blog</a>
+</nav>
+</body></html>
+"""
+
+# Homepage fixture: nav with About, Blog and Contact only, no careers-like link.
+HOMEPAGE_NO_CAREER_LINKS = """
+<html><body>
+<nav>
+  <a href="/about">About</a>
+  <a href="/blog">Blog</a>
+  <a href="/contact">Contact</a>
+</nav>
+</body></html>
+"""
+
+# Homepage fixture: a single link whose href is the relative path "jobs".
+HOMEPAGE_RELATIVE_LINK = """
+<html><body>
+<a href="jobs">Open Jobs</a>
+</body></html>
+"""
+
+# Homepage fixture: a single absolute link to an external jobs.lever.co board.
+HOMEPAGE_ABSOLUTE_LEVER = """
+<html><body>
+<a href="https://jobs.lever.co/acme">Work with us</a>
+</body></html>
+"""
+
+
+class TestScanHomepageLinks:
+    """Tests for scan_homepage_links using inline HTML and stubbed requests."""
+
+    @staticmethod
+    def _scan(markup):
+        """Scan https://acme.com with the homepage HTML supplied by the test."""
+        return scan_homepage_links(
+            "https://acme.com", FakeClient(), homepage_html=markup
+        )
+
+    def test_finds_careers_link(self):
+        found = self._scan(HOMEPAGE_WITH_CAREERS)
+        assert found is not None
+        assert "careers" in found
+
+    def test_no_career_links_returns_none(self):
+        assert self._scan(HOMEPAGE_NO_CAREER_LINKS) is None
+
+    def test_relative_href_resolved(self):
+        found = self._scan(HOMEPAGE_RELATIVE_LINK)
+        # "jobs" href + "Open Jobs" text should score above threshold
+        assert found is not None
+        assert found.startswith("https://acme.com")
+
+    def test_absolute_external_link_preserved(self):
+        assert self._scan(HOMEPAGE_ABSOLUTE_LEVER) == "https://jobs.lever.co/acme"
+
+    def test_empty_html_returns_none(self):
+        assert self._scan("") is None
+
+    def test_fetch_failure_returns_none(self, monkeypatch):
+        def refuse(*_args, **_kwargs):
+            raise RuntimeError("connection refused")
+
+        monkeypatch.setattr(
+            "jobsource.careers.heuristics.request_with_retries", refuse
+        )
+        assert scan_homepage_links("https://acme.com", FakeClient()) is None
+
+    def test_http_error_status_returns_none(self, monkeypatch):
+        def server_error(*_args, **_kwargs):
+            return FakeResponse(500, "error")
+
+        monkeypatch.setattr(
+            "jobsource.careers.heuristics.request_with_retries", server_error
+        )
+        assert scan_homepage_links("https://acme.com", FakeClient()) is None
+
+    def test_skips_mailto_links(self):
+        markup = '<html><body><a href="mailto:jobs@acme.com">Email us</a></body></html>'
+        assert self._scan(markup) is None
+
+    def test_skips_fragment_only_links(self):
+        markup = '<html><body><a href="#careers">Careers</a></body></html>'
+        assert self._scan(markup) is None
+
+    def test_prefers_href_careers_over_unrelated_text(self):
+        # The /careers href should win even though its link text is neutral,
+        # while /about with neutral text should score lower.
+        markup = """
+        <html><body>
+          <a href="/about">Company information</a>
+          <a href="/careers">Some random text</a>
+        </body></html>
+        """
+        found = self._scan(markup)
+        assert found is not None
+        assert "careers" in found
+
+
+# ---------------------------------------------------------------------------
+# parse_sitemap
+# ---------------------------------------------------------------------------
+
+
+# Flat <urlset> listing /about, /careers and /blog.
+SIMPLE_SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>https://acme.com/about</loc></url>
+  <url><loc>https://acme.com/careers</loc></url>
+  <url><loc>https://acme.com/blog</loc></url>
+</urlset>"""
+
+# Flat <urlset> listing only /about and /blog.
+SITEMAP_NO_CAREERS = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>https://acme.com/about</loc></url>
+  <url><loc>https://acme.com/blog</loc></url>
+</urlset>"""
+
+# <sitemapindex> pointing at two child sitemaps: sitemap-pages.xml and
+# sitemap-jobs.xml.
+SITEMAP_INDEX = """<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <sitemap><loc>https://acme.com/sitemap-pages.xml</loc></sitemap>
+  <sitemap><loc>https://acme.com/sitemap-jobs.xml</loc></sitemap>
+</sitemapindex>"""
+
+# Child <urlset> with a single /jobs/senior-engineer entry.
+SITEMAP_JOBS_CHILD = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>https://acme.com/jobs/senior-engineer</loc></url>
+</urlset>"""
+
+# Child <urlset> with a single /about entry.
+SITEMAP_PAGES_CHILD = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>https://acme.com/about</loc></url>
+</urlset>"""
+
+
+class TestParseSitemap:
+    """Tests for parse_sitemap with request_with_retries replaced by stubs."""
+
+    @staticmethod
+    def _patch_requests(monkeypatch, handler):
+        """Swap the heuristics module's request_with_retries for *handler*."""
+        monkeypatch.setattr(
+            "jobsource.careers.heuristics.request_with_retries", handler
+        )
+
+    @staticmethod
+    def _respond_with(*response_args):
+        """Build a stub that answers every request with a fresh FakeResponse."""
+        def handler(*_args, **_kwargs):
+            return FakeResponse(*response_args)
+
+        return handler
+
+    def test_finds_careers_url(self, monkeypatch):
+        self._patch_requests(
+            monkeypatch,
+            self._respond_with(200, SIMPLE_SITEMAP, "https://acme.com/sitemap.xml"),
+        )
+        assert parse_sitemap("https://acme.com", FakeClient()) == "https://acme.com/careers"
+
+    def test_no_careers_url_returns_none(self, monkeypatch):
+        self._patch_requests(monkeypatch, self._respond_with(200, SITEMAP_NO_CAREERS))
+        assert parse_sitemap("https://acme.com", FakeClient()) is None
+
+    def test_404_returns_none(self, monkeypatch):
+        self._patch_requests(monkeypatch, self._respond_with(404, ""))
+        assert parse_sitemap("https://acme.com", FakeClient()) is None
+
+    def test_network_error_returns_none(self, monkeypatch):
+        def time_out(*_args, **_kwargs):
+            raise RuntimeError("timeout")
+
+        self._patch_requests(monkeypatch, time_out)
+        assert parse_sitemap("https://acme.com", FakeClient()) is None
+
+    def test_sitemap_index_fetches_children(self, monkeypatch):
+        # Canned responses keyed by URL: the index plus its two children.
+        canned = {
+            "https://acme.com/sitemap.xml": FakeResponse(200, SITEMAP_INDEX),
+            "https://acme.com/sitemap-pages.xml": FakeResponse(200, SITEMAP_PAGES_CHILD),
+            "https://acme.com/sitemap-jobs.xml": FakeResponse(200, SITEMAP_JOBS_CHILD),
+        }
+
+        def fake_req(client, method, url, **kw):
+            # Look up by the URL without its query string; unknown URLs get a 404.
+            without_query = url.partition("?")[0]
+            if without_query in canned:
+                return canned[without_query]
+            return FakeResponse(404, "")
+
+        self._patch_requests(monkeypatch, fake_req)
+        found = parse_sitemap("https://acme.com", FakeClient())
+        assert found == "https://acme.com/jobs/senior-engineer"
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -102,7 +102,9 @@ def test_job_result_is_complete() -> None:
 def test_settings_load_defaults() -> None:
    from jobsource.config import Settings

-    s = Settings()
+    # _env_file=None suppresses .env loading for this instance so we see the
+    # coded defaults, not whatever the operator has set in the real .env file.
+    s = Settings(_env_file=None)
    assert s.job_source == "jobspy"
    assert s.batch_size == 20
    assert s.hours_old == 72