"""Tests for jobsource/careers/cascade.py — all network-free via monkeypatching."""

from __future__ import annotations

import pytest

from jobsource.careers import CareersResult, find_careers_page
from jobsource.careers.ats import ATSBoard, ATSFetch, ATSResult
from jobsource.careers.cascade import _detect_ats_in_page, _finalize, _safe_get_html

# Dotted path of the module under test; every monkeypatch target hangs off it.
_CASCADE = "jobsource.careers.cascade"


# ---------------------------------------------------------------------------
# Fake helpers
# ---------------------------------------------------------------------------


class FakeResponse:
    """Minimal stand-in for an HTTP response: status code, body text and URL."""

    def __init__(self, status_code: int, text: str = "", url: str = "https://example.com"):
        self.status_code = status_code
        self.text = text
        self.url = url


class FakeClient:
    """Opaque placeholder passed wherever the code under test expects a client."""


def _returning(value):
    """Build a stub that ignores its arguments and returns *value*."""

    def _stub(*args, **kwargs):
        return value

    return _stub


def _raising(message: str):
    """Build a stub that ignores its arguments and raises ``RuntimeError(message)``."""

    def _stub(*args, **kwargs):
        raise RuntimeError(message)

    return _stub


def _stub_request(monkeypatch, stub) -> None:
    """Replace the cascade module's ``request_with_retries`` with *stub*."""
    monkeypatch.setattr(f"{_CASCADE}.request_with_retries", stub)


def _stub_cascade(
    monkeypatch,
    *,
    html="",
    ats=None,
    slug_guess=None,
    url_pattern=None,
    homepage=None,
    sitemap=None,
) -> None:
    """Stub every cascade tier so each returns the given canned value.

    Tiers left at ``None`` behave as misses. ``detect_ats_in_url`` and
    ``_detect_ats_in_page`` are always stubbed to ``None`` so that
    ``_finalize`` never attempts an ATS upgrade (and never reaches the real
    request path) during tier-ordering tests. Individual tests may override
    any of these afterwards with another ``monkeypatch.setattr`` call.
    """
    canned = {
        "_safe_get_html": html,
        "_ats.detect_and_fetch": ats,
        "_ats.recover_via_slug_guess": slug_guess,
        "_heuristics.probe_url_patterns": url_pattern,
        "_heuristics.scan_homepage_links": homepage,
        "_heuristics.parse_sitemap": sitemap,
        "_ats.detect_ats_in_url": None,
        "_detect_ats_in_page": None,
    }
    for attr, value in canned.items():
        monkeypatch.setattr(f"{_CASCADE}.{attr}", _returning(value))


# ---------------------------------------------------------------------------
# _detect_ats_in_page
# ---------------------------------------------------------------------------


class TestDetectATSInPage:
    """``_detect_ats_in_page`` fetches a page and looks for an ATS board in it."""

    def test_returns_board_from_page_html(self, monkeypatch):
        # The fixture has to contain ATS markup: a bare "Jobs" string gives
        # the detector nothing to match, so the assertions below could never
        # hold.
        # NOTE(review): assumes the detector recognises a
        # boards.greenhouse.io/<slug> link — confirm against the patterns in
        # jobsource.careers.ats.
        html = '<a href="https://boards.greenhouse.io/vercel">Jobs</a>'
        _stub_request(monkeypatch, _returning(FakeResponse(200, html)))

        board = _detect_ats_in_page("https://vercel.com/careers", FakeClient())

        assert board is not None
        assert board.ats_name == "greenhouse"
        assert board.slug == "vercel"

    def test_returns_none_on_404(self, monkeypatch):
        _stub_request(monkeypatch, _returning(FakeResponse(404, "")))

        assert _detect_ats_in_page("https://vercel.com/careers", FakeClient()) is None

    def test_returns_none_on_exception(self, monkeypatch):
        _stub_request(monkeypatch, _raising("timeout"))

        assert _detect_ats_in_page("https://vercel.com/careers", FakeClient()) is None

    def test_returns_none_when_no_ats_in_page(self, monkeypatch):
        _stub_request(monkeypatch, _returning(FakeResponse(200, "no ats here")))

        assert _detect_ats_in_page("https://acme.com/careers", FakeClient()) is None


# ---------------------------------------------------------------------------
# _safe_get_html
# ---------------------------------------------------------------------------


class TestSafeGetHtml:
    """``_safe_get_html`` returns the body on success and ``None`` otherwise."""

    def test_returns_text_on_200(self, monkeypatch):
        _stub_request(monkeypatch, _returning(FakeResponse(200, "page html")))

        result = _safe_get_html("https://acme.com", FakeClient())

        assert result == "page html"

    def test_returns_none_on_404(self, monkeypatch):
        _stub_request(monkeypatch, _returning(FakeResponse(404, "")))

        result = _safe_get_html("https://acme.com", FakeClient())

        assert result is None

    def test_returns_none_on_exception(self, monkeypatch):
        _stub_request(monkeypatch, _raising("network error"))

        result = _safe_get_html("https://acme.com", FakeClient())

        assert result is None


# ---------------------------------------------------------------------------
# find_careers_page — tier ordering and early return
# ---------------------------------------------------------------------------


class TestCascadeTierOrdering:
    """The cascade tries tiers in order and returns on the first hit."""

    def _patch_tiers(
        self,
        monkeypatch,
        *,
        ats=None,
        slug_guess=None,
        url_pattern=None,
        homepage=None,
        sitemap=None,
        html="",
    ):
        """Stub all tiers with canned results; ``None`` means the tier misses."""
        _stub_cascade(
            monkeypatch,
            html=html,
            ats=ats,
            slug_guess=slug_guess,
            url_pattern=url_pattern,
            homepage=homepage,
            sitemap=sitemap,
        )

    def test_ats_hit_returns_095_confidence(self, monkeypatch):
        ats_result = ATSResult(
            ats_name="greenhouse",
            careers_url="https://boards.greenhouse.io/acme",
            position_url="https://careers.acme.com/positions/1",
            job_count=10,
        )
        self._patch_tiers(monkeypatch, ats=ats_result)

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url == "https://boards.greenhouse.io/acme"
        assert result.confidence == 0.95
        assert result.method == "ats:greenhouse"
        assert result.ats_name == "greenhouse"
        assert result.position_url == "https://careers.acme.com/positions/1"

    def test_url_pattern_hit_when_ats_misses(self, monkeypatch):
        self._patch_tiers(monkeypatch, url_pattern="https://acme.com/careers")

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url == "https://acme.com/careers"
        assert result.confidence == 0.80
        assert result.method == "url_pattern"
        assert result.ats_name is None

    def test_homepage_scan_hit_when_ats_and_url_pattern_miss(self, monkeypatch):
        self._patch_tiers(monkeypatch, homepage="https://acme.com/careers")

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url == "https://acme.com/careers"
        assert result.confidence == 0.60
        assert result.method == "homepage_scan"

    def test_sitemap_hit_when_all_else_misses(self, monkeypatch):
        self._patch_tiers(monkeypatch, sitemap="https://acme.com/careers")

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url == "https://acme.com/careers"
        assert result.confidence == 0.50
        assert result.method == "sitemap"

    def test_all_miss_returns_none_method(self, monkeypatch):
        self._patch_tiers(monkeypatch)

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url is None
        assert result.confidence == 0.0
        assert result.method == "none"

    def test_ats_hit_skips_later_tiers(self, monkeypatch):
        """When ATS resolves, slug_guess/url_pattern/homepage/sitemap should not be called."""
        ats_result = ATSResult(
            ats_name="lever",
            careers_url="https://jobs.lever.co/acme",
        )
        later_called: list[str] = []

        def make_tracker(name):
            def fn(*a, **kw):
                later_called.append(name)
                return None

            return fn

        self._patch_tiers(monkeypatch, ats=ats_result)
        # Replace every tier after the ATS one with a recording stub.
        trackers = {
            "_ats.recover_via_slug_guess": "slug_guess",
            "_heuristics.probe_url_patterns": "url_pattern",
            "_heuristics.scan_homepage_links": "homepage_scan",
            "_heuristics.parse_sitemap": "sitemap",
        }
        for attr, label in trackers.items():
            monkeypatch.setattr(f"{_CASCADE}.{attr}", make_tracker(label))

        find_careers_page("https://www.acme.com", client=FakeClient())

        assert later_called == []

    def test_failing_tier_falls_through(self, monkeypatch):
        """A tier that raises should not abort the cascade."""
        self._patch_tiers(monkeypatch, url_pattern="https://acme.com/careers")
        monkeypatch.setattr(f"{_CASCADE}._ats.detect_and_fetch", _raising("ats exploded"))

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.method == "url_pattern"
        assert result.careers_url == "https://acme.com/careers"


# ---------------------------------------------------------------------------
# ATS-URL upgrade in _finalize
# ---------------------------------------------------------------------------


class TestFinalizeATSUpgrade:
    """``_finalize`` upgrades a heuristic hit to an ATS result when it can."""

    def test_lever_url_upgrades_to_ats_lever(self, monkeypatch):
        """When a heuristic finds a Lever URL, _finalize upgrades to ats:lever."""
        lever_board = ATSBoard(
            ats_name="lever",
            slug="acme",
            careers_url="https://jobs.lever.co/acme",
        )
        monkeypatch.setattr(f"{_CASCADE}._ats.detect_ats_in_url", lambda url: lever_board)
        monkeypatch.setattr(
            f"{_CASCADE}._ats._FETCH_DISPATCH",
            {
                "lever": lambda board, client: ATSFetch(
                    first_url="https://jobs.lever.co/acme/abc", job_count=5
                )
            },
        )

        result = _finalize(
            "https://jobs.lever.co/acme",
            "homepage_scan",
            0.60,
            "https://www.acme.com",
            FakeClient(),
        )

        assert result.method == "ats:lever"
        assert result.confidence == 0.95
        assert result.position_url == "https://jobs.lever.co/acme/abc"
        assert result.ats_name == "lever"

    def test_non_ats_url_no_upgrade(self, monkeypatch):
        monkeypatch.setattr(f"{_CASCADE}._ats.detect_ats_in_url", lambda url: None)
        # With no ATS in the URL, _finalize falls back to inspecting the page;
        # stub that too so the test never reaches the real request path.
        monkeypatch.setattr(f"{_CASCADE}._detect_ats_in_page", lambda url, client: None)

        result = _finalize(
            "https://acme.com/careers",
            "url_pattern",
            0.80,
            "https://www.acme.com",
            FakeClient(),
        )

        assert result.method == "url_pattern"
        assert result.confidence == 0.80
        assert result.position_url is None

    def test_ats_upgrade_from_page_html(self, monkeypatch):
        """URL pattern finds /careers; fetching that page reveals Greenhouse embed → upgrade."""
        gh_board = ATSBoard(
            ats_name="greenhouse",
            slug="vercel",
            careers_url="https://boards.greenhouse.io/vercel",
        )
        monkeypatch.setattr(
            f"{_CASCADE}._ats.detect_ats_in_url",
            lambda url: None,  # URL string itself is not an ATS URL
        )
        monkeypatch.setattr(
            f"{_CASCADE}._detect_ats_in_page",
            lambda url, client: gh_board,  # page HTML reveals Greenhouse
        )
        monkeypatch.setattr(
            f"{_CASCADE}._ats._FETCH_DISPATCH",
            {
                "greenhouse": lambda board, client: ATSFetch(
                    first_url="https://job-boards.greenhouse.io/vercel/jobs/123",
                    job_count=73,
                )
            },
        )

        result = _finalize(
            "https://vercel.com/careers",
            "url_pattern",
            0.80,
            "https://vercel.com",
            FakeClient(),
        )

        assert result.method == "ats:greenhouse"
        assert result.confidence == 0.95
        assert result.careers_url == "https://boards.greenhouse.io/vercel"
        assert result.position_url == "https://job-boards.greenhouse.io/vercel/jobs/123"
        assert result.ats_name == "greenhouse"

    def test_ats_upgrade_fetch_failure_falls_back_to_original(self, monkeypatch):
        """If ATS fetch during upgrade fails, return the original heuristic result."""
        gh_board = ATSBoard(
            ats_name="greenhouse",
            slug="acme",
            careers_url="https://boards.greenhouse.io/acme",
        )
        monkeypatch.setattr(f"{_CASCADE}._ats.detect_ats_in_url", lambda url: gh_board)

        def boom(board, client):
            raise RuntimeError("api down")

        monkeypatch.setattr(f"{_CASCADE}._ats._FETCH_DISPATCH", {"greenhouse": boom})

        result = _finalize(
            "https://boards.greenhouse.io/acme",
            "sitemap",
            0.50,
            "https://www.acme.com",
            FakeClient(),
        )

        # Upgrade failed — returns the original heuristic result
        assert result.method == "sitemap"
        assert result.confidence == 0.50


# ---------------------------------------------------------------------------
# CareersResult model
# ---------------------------------------------------------------------------


class TestCareersResultModel:
    """Defaults and explicit construction of ``CareersResult``."""

    def test_defaults(self):
        r = CareersResult()

        assert r.careers_url is None
        assert r.confidence == 0.0
        assert r.method == "none"
        assert r.ats_name is None
        assert r.position_url is None

    def test_full(self):
        r = CareersResult(
            careers_url="https://boards.greenhouse.io/acme",
            confidence=0.95,
            method="ats:greenhouse",
            ats_name="greenhouse",
            position_url="https://careers.acme.com/positions/1",
        )

        # Check every field that was set, not just a sample of them.
        assert r.careers_url == "https://boards.greenhouse.io/acme"
        assert r.confidence == 0.95
        assert r.method == "ats:greenhouse"
        assert r.ats_name == "greenhouse"
        assert r.position_url == "https://careers.acme.com/positions/1"


# ---------------------------------------------------------------------------
# Tier 1b — slug-guess in cascade ordering
# ---------------------------------------------------------------------------


class TestSlugGuessTier:
    """Tests that slug-guess wires correctly into the cascade between HTML-ATS and url_pattern."""

    def _patch_for_slug_guess(self, monkeypatch, *, slug_guess_result=None, url_pattern=None):
        """Patch cascade so HTML-ATS always misses; control slug_guess and url_pattern."""
        _stub_cascade(
            monkeypatch,
            html="",
            ats=None,
            slug_guess=slug_guess_result,
            url_pattern=url_pattern,
        )

    def test_slug_guess_hit_returns_090_confidence(self, monkeypatch):
        """A slug-guess hit should produce confidence=0.90 and method=ats:{name}:slug_guess."""
        slug_result = ATSResult(
            ats_name="greenhouse",
            careers_url="https://boards.greenhouse.io/anthropic",
            position_url="https://boards.greenhouse.io/anthropic/jobs/1",
            job_count=370,
        )
        self._patch_for_slug_guess(monkeypatch, slug_guess_result=slug_result)

        result = find_careers_page(
            "https://www.anthropic.com",
            company_name="Anthropic",
            client=FakeClient(),
        )

        assert result.confidence == 0.90
        assert result.method == "ats:greenhouse:slug_guess"
        assert result.careers_url == "https://boards.greenhouse.io/anthropic"
        assert result.ats_name == "greenhouse"
        assert result.position_url == "https://boards.greenhouse.io/anthropic/jobs/1"

    def test_slug_guess_hit_blocks_url_pattern(self, monkeypatch):
        """If slug-guess hits, url_pattern should not be called."""
        url_pattern_called: list[bool] = []
        slug_result = ATSResult(
            ats_name="lever",
            careers_url="https://jobs.lever.co/acme",
            position_url="https://jobs.lever.co/acme/xyz",
            job_count=5,
        )
        self._patch_for_slug_guess(
            monkeypatch,
            slug_guess_result=slug_result,
            url_pattern="https://acme.com/careers",
        )

        def track_url_pattern(*a, **kw):
            url_pattern_called.append(True)
            return None

        # Override url_pattern with a tracker
        monkeypatch.setattr(f"{_CASCADE}._heuristics.probe_url_patterns", track_url_pattern)

        find_careers_page("https://www.acme.com", client=FakeClient())

        assert url_pattern_called == []

    def test_slug_guess_miss_falls_through_to_url_pattern(self, monkeypatch):
        """When slug-guess misses, the cascade continues to url_pattern."""
        self._patch_for_slug_guess(
            monkeypatch,
            slug_guess_result=None,
            url_pattern="https://acme.com/careers",
        )

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.method == "url_pattern"
        assert result.confidence == 0.80

    def test_company_name_forwarded_to_slug_guess(self, monkeypatch):
        """company_name must be passed through to recover_via_slug_guess."""
        received: list[tuple] = []

        def fake_recover(website, company_name, client):
            received.append((website, company_name))
            return None

        self._patch_for_slug_guess(monkeypatch)
        monkeypatch.setattr(f"{_CASCADE}._ats.recover_via_slug_guess", fake_recover)

        find_careers_page(
            "https://www.anthropic.com",
            company_name="Anthropic",
            client=FakeClient(),
        )

        assert len(received) == 1
        assert received[0] == ("https://www.anthropic.com", "Anthropic")

    def test_slug_guess_tier_error_falls_through(self, monkeypatch):
        """A slug-guess tier that raises should not abort the cascade."""
        self._patch_for_slug_guess(monkeypatch, url_pattern="https://acme.com/careers")
        monkeypatch.setattr(
            f"{_CASCADE}._ats.recover_via_slug_guess",
            _raising("slug_guess exploded"),
        )

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.method == "url_pattern"
        assert result.careers_url == "https://acme.com/careers"