# JobSourceAgent/tests/test_cascade.py
# (file-viewer header: 553 lines, 21 KiB, Python)

"""Tests for jobsource/careers/cascade.py — all network-free via monkeypatching."""
from __future__ import annotations
import pytest
from jobsource.careers import CareersResult, find_careers_page
from jobsource.careers.cascade import _detect_ats_in_page, _finalize, _safe_get_html
from jobsource.careers.ats import ATSBoard, ATSFetch, ATSResult
# ---------------------------------------------------------------------------
# Fake helpers
# ---------------------------------------------------------------------------
class FakeResponse:
    """Minimal stand-in for an HTTP response.

    Stores exactly the three attributes assigned below: ``status_code``,
    ``text`` and ``url``.
    """

    def __init__(self, status_code: int, text: str = "", url: str = "https://example.com"):
        self.status_code, self.text, self.url = status_code, text, url
class FakeClient:
    """Attribute-less placeholder handed to the code under test as its ``client``."""
# ---------------------------------------------------------------------------
# _detect_ats_in_page
# ---------------------------------------------------------------------------
class TestDetectATSInPage:
    """Exercise ``_detect_ats_in_page`` with ``request_with_retries`` replaced by stubs."""

    _REQUEST = "jobsource.careers.cascade.request_with_retries"

    def _respond_with(self, monkeypatch, status_code, body):
        """Patch the request helper to answer every call with a fresh ``FakeResponse``."""
        def canned(*args, **kwargs):
            return FakeResponse(status_code, body)

        monkeypatch.setattr(self._REQUEST, canned)

    def test_returns_board_from_page_html(self, monkeypatch):
        """A 200 page that links to a Greenhouse board yields that board."""
        self._respond_with(
            monkeypatch, 200, '<a href="https://boards.greenhouse.io/vercel">Jobs</a>'
        )

        detected = _detect_ats_in_page("https://vercel.com/careers", FakeClient())

        assert detected is not None
        assert detected.ats_name == "greenhouse"
        assert detected.slug == "vercel"

    def test_returns_none_on_404(self, monkeypatch):
        """A 404 response produces no board."""
        self._respond_with(monkeypatch, 404, "")

        detected = _detect_ats_in_page("https://vercel.com/careers", FakeClient())

        assert detected is None

    def test_returns_none_on_exception(self, monkeypatch):
        """An error raised by the request helper is reported as ``None``."""
        def explode(*args, **kwargs):
            raise RuntimeError("timeout")

        monkeypatch.setattr(self._REQUEST, explode)

        detected = _detect_ats_in_page("https://vercel.com/careers", FakeClient())

        assert detected is None

    def test_returns_none_when_no_ats_in_page(self, monkeypatch):
        """A 200 page without any ATS reference produces no board."""
        self._respond_with(monkeypatch, 200, "<html>no ats here</html>")

        detected = _detect_ats_in_page("https://acme.com/careers", FakeClient())

        assert detected is None
# ---------------------------------------------------------------------------
# _safe_get_html
# ---------------------------------------------------------------------------
class TestSafeGetHtml:
    """Exercise ``_safe_get_html`` with ``request_with_retries`` replaced by stubs."""

    _REQUEST = "jobsource.careers.cascade.request_with_retries"

    def _fetch_with(self, monkeypatch, request_stub):
        """Install ``request_stub`` as the request helper and fetch a fixed URL."""
        monkeypatch.setattr(self._REQUEST, request_stub)
        return _safe_get_html("https://acme.com", FakeClient())

    def test_returns_text_on_200(self, monkeypatch):
        """The body text of a 200 response is returned unchanged."""
        def ok(*args, **kwargs):
            return FakeResponse(200, "page html")

        assert self._fetch_with(monkeypatch, ok) == "page html"

    def test_returns_none_on_404(self, monkeypatch):
        """A 404 response yields ``None``."""
        def not_found(*args, **kwargs):
            return FakeResponse(404, "")

        assert self._fetch_with(monkeypatch, not_found) is None

    def test_returns_none_on_exception(self, monkeypatch):
        """An error raised by the request helper yields ``None``."""
        def explode(*args, **kwargs):
            raise RuntimeError("network error")

        assert self._fetch_with(monkeypatch, explode) is None
# ---------------------------------------------------------------------------
# find_careers_page — tier ordering and early return
# ---------------------------------------------------------------------------
class TestCascadeTierOrdering:
    """Check tier order, per-tier confidence and early return of ``find_careers_page``.

    Every collaborator is replaced on ``jobsource.careers.cascade`` through
    ``_patch_tiers`` so the tests stay network-free.
    """

    _CASCADE = "jobsource.careers.cascade"

    @staticmethod
    def _always(value):
        """Build a stub that ignores its arguments and returns ``value``."""
        def stub(*args, **kwargs):
            return value

        return stub

    def _patch_tiers(self, monkeypatch, *, ats=None, slug_guess=None,
                     url_pattern=None, homepage=None, sitemap=None, html="<html/>"):
        """Stub every cascade tier; ``None`` (the default) makes that tier miss.

        Args:
            monkeypatch: pytest's ``monkeypatch`` fixture.
            ats: value returned by ``_ats.detect_and_fetch``.
            slug_guess: value returned by ``_ats.recover_via_slug_guess``.
            url_pattern: value returned by ``_heuristics.probe_url_patterns``.
            homepage: value returned by ``_heuristics.scan_homepage_links``.
            sitemap: value returned by ``_heuristics.parse_sitemap``.
            html: value returned by ``_safe_get_html``.
        """
        base = self._CASCADE
        monkeypatch.setattr(f"{base}._safe_get_html", lambda website, client: html)
        tier_results = {
            "_ats.detect_and_fetch": ats,
            "_ats.recover_via_slug_guess": slug_guess,
            "_heuristics.probe_url_patterns": url_pattern,
            "_heuristics.scan_homepage_links": homepage,
            "_heuristics.parse_sitemap": sitemap,
        }
        for attr, value in tier_results.items():
            monkeypatch.setattr(f"{base}.{attr}", self._always(value))
        # _finalize checks a heuristic hit for an ATS both by URL and by
        # fetching the page; stub both so it never attempts a request.
        monkeypatch.setattr(f"{base}._ats.detect_ats_in_url", lambda url: None)
        monkeypatch.setattr(f"{base}._detect_ats_in_page", self._always(None))

    def test_ats_hit_returns_095_confidence(self, monkeypatch):
        """An ATS hit is reported with confidence 0.95 and method ``ats:<name>``."""
        ats_result = ATSResult(
            ats_name="greenhouse",
            careers_url="https://boards.greenhouse.io/acme",
            position_url="https://careers.acme.com/positions/1",
            job_count=10,
        )
        self._patch_tiers(monkeypatch, ats=ats_result)

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url == "https://boards.greenhouse.io/acme"
        assert result.confidence == 0.95
        assert result.method == "ats:greenhouse"
        assert result.ats_name == "greenhouse"
        assert result.position_url == "https://careers.acme.com/positions/1"

    def test_url_pattern_hit_when_ats_misses(self, monkeypatch):
        """With ATS tiers missing, a URL-pattern hit is reported at 0.80."""
        self._patch_tiers(monkeypatch, url_pattern="https://acme.com/careers")

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url == "https://acme.com/careers"
        assert result.confidence == 0.80
        assert result.method == "url_pattern"
        assert result.ats_name is None

    def test_homepage_scan_hit_when_ats_and_url_pattern_miss(self, monkeypatch):
        """With earlier tiers missing, a homepage-scan hit is reported at 0.60."""
        self._patch_tiers(monkeypatch, homepage="https://acme.com/careers")

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url == "https://acme.com/careers"
        assert result.confidence == 0.60
        assert result.method == "homepage_scan"

    def test_sitemap_hit_when_all_else_misses(self, monkeypatch):
        """With every other tier missing, a sitemap hit is reported at 0.50."""
        self._patch_tiers(monkeypatch, sitemap="https://acme.com/careers")

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url == "https://acme.com/careers"
        assert result.confidence == 0.50
        assert result.method == "sitemap"

    def test_all_miss_returns_none_method(self, monkeypatch):
        """When every tier misses, the result is empty with method ``none``."""
        self._patch_tiers(monkeypatch)

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url is None
        assert result.confidence == 0.0
        assert result.method == "none"

    def test_ats_hit_skips_later_tiers(self, monkeypatch):
        """When ATS resolves, slug_guess/url_pattern/homepage/sitemap should not be called."""
        ats_result = ATSResult(
            ats_name="lever", careers_url="https://jobs.lever.co/acme",
        )
        self._patch_tiers(monkeypatch, ats=ats_result)
        later_called: list[str] = []

        def make_tracker(name):
            def fn(*a, **kw):
                later_called.append(name)
                return None

            return fn

        # Replace the later-tier stubs with trackers that record any call.
        trackers = {
            "_ats.recover_via_slug_guess": "slug_guess",
            "_heuristics.probe_url_patterns": "url_pattern",
            "_heuristics.scan_homepage_links": "homepage_scan",
            "_heuristics.parse_sitemap": "sitemap",
        }
        for attr, name in trackers.items():
            monkeypatch.setattr(f"{self._CASCADE}.{attr}", make_tracker(name))

        find_careers_page("https://www.acme.com", client=FakeClient())

        assert later_called == []

    def test_failing_tier_falls_through(self, monkeypatch):
        """A tier that raises should not abort the cascade."""
        self._patch_tiers(monkeypatch, url_pattern="https://acme.com/careers")

        def boom(*args, **kwargs):
            raise RuntimeError("ats exploded")

        # Override only the first tier so that it fails instead of missing.
        monkeypatch.setattr(f"{self._CASCADE}._ats.detect_and_fetch", boom)

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.method == "url_pattern"
        assert result.careers_url == "https://acme.com/careers"
# ---------------------------------------------------------------------------
# ATS-URL upgrade in _finalize
# ---------------------------------------------------------------------------
class TestFinalizeATSUpgrade:
    """Check how ``_finalize`` upgrades (or keeps) a heuristic hit.

    ATS detection and the per-ATS fetch table are patched on
    ``jobsource.careers.cascade`` so no test performs a request.
    """

    _DETECT_IN_URL = "jobsource.careers.cascade._ats.detect_ats_in_url"
    _DETECT_IN_PAGE = "jobsource.careers.cascade._detect_ats_in_page"
    _FETCH_DISPATCH = "jobsource.careers.cascade._ats._FETCH_DISPATCH"

    def test_lever_url_upgrades_to_ats_lever(self, monkeypatch):
        """When a heuristic finds a Lever URL, _finalize upgrades to ats:lever."""
        lever_board = ATSBoard(
            ats_name="lever",
            slug="acme",
            careers_url="https://jobs.lever.co/acme",
        )
        monkeypatch.setattr(self._DETECT_IN_URL, lambda url: lever_board)
        monkeypatch.setattr(
            self._FETCH_DISPATCH,
            {"lever": lambda board, client: ATSFetch(first_url="https://jobs.lever.co/acme/abc", job_count=5)},
        )

        result = _finalize(
            "https://jobs.lever.co/acme", "homepage_scan", 0.60,
            "https://www.acme.com", FakeClient(),
        )

        assert result.method == "ats:lever"
        assert result.confidence == 0.95
        assert result.position_url == "https://jobs.lever.co/acme/abc"
        assert result.ats_name == "lever"

    def test_non_ats_url_no_upgrade(self, monkeypatch):
        """Without any ATS signal the heuristic result is returned unchanged."""
        monkeypatch.setattr(self._DETECT_IN_URL, lambda url: None)
        # The page-level detector issues a request; stub it so the test is
        # hermetic rather than depending on the real helper swallowing an error.
        monkeypatch.setattr(self._DETECT_IN_PAGE, lambda *a, **kw: None)

        result = _finalize(
            "https://acme.com/careers", "url_pattern", 0.80,
            "https://www.acme.com", FakeClient(),
        )

        assert result.method == "url_pattern"
        assert result.confidence == 0.80
        assert result.position_url is None

    def test_ats_upgrade_from_page_html(self, monkeypatch):
        """URL pattern finds /careers; fetching that page reveals Greenhouse embed → upgrade."""
        gh_board = ATSBoard(
            ats_name="greenhouse",
            slug="vercel",
            careers_url="https://boards.greenhouse.io/vercel",
        )
        # The URL string itself is not an ATS URL ...
        monkeypatch.setattr(self._DETECT_IN_URL, lambda url: None)
        # ... but the page HTML reveals Greenhouse.
        monkeypatch.setattr(self._DETECT_IN_PAGE, lambda url, client: gh_board)
        monkeypatch.setattr(
            self._FETCH_DISPATCH,
            {"greenhouse": lambda board, client: ATSFetch(first_url="https://job-boards.greenhouse.io/vercel/jobs/123", job_count=73)},
        )

        result = _finalize(
            "https://vercel.com/careers", "url_pattern", 0.80,
            "https://vercel.com", FakeClient(),
        )

        assert result.method == "ats:greenhouse"
        assert result.confidence == 0.95
        assert result.careers_url == "https://boards.greenhouse.io/vercel"
        assert result.position_url == "https://job-boards.greenhouse.io/vercel/jobs/123"
        assert result.ats_name == "greenhouse"

    def test_ats_upgrade_fetch_failure_falls_back_to_original(self, monkeypatch):
        """If ATS fetch during upgrade fails, return the original heuristic result."""
        gh_board = ATSBoard(
            ats_name="greenhouse",
            slug="acme",
            careers_url="https://boards.greenhouse.io/acme",
        )
        monkeypatch.setattr(self._DETECT_IN_URL, lambda url: gh_board)

        def boom(board, client):
            raise RuntimeError("api down")

        monkeypatch.setattr(self._FETCH_DISPATCH, {"greenhouse": boom})

        result = _finalize(
            "https://boards.greenhouse.io/acme", "sitemap", 0.50,
            "https://www.acme.com", FakeClient(),
        )

        # Upgrade failed — returns the original heuristic result
        assert result.method == "sitemap"
        assert result.confidence == 0.50
# ---------------------------------------------------------------------------
# CareersResult model
# ---------------------------------------------------------------------------
class TestCareersResultModel:
    """Check the default and fully-populated states of ``CareersResult``."""

    def test_defaults(self):
        """A ``CareersResult`` built with no arguments represents "nothing found"."""
        r = CareersResult()

        assert r.careers_url is None
        assert r.confidence == 0.0
        assert r.method == "none"
        assert r.ats_name is None
        assert r.position_url is None

    def test_full(self):
        """Every field passed to the constructor is readable back unchanged."""
        r = CareersResult(
            careers_url="https://boards.greenhouse.io/acme",
            confidence=0.95,
            method="ats:greenhouse",
            ats_name="greenhouse",
            position_url="https://careers.acme.com/positions/1",
        )

        assert r.careers_url == "https://boards.greenhouse.io/acme"
        assert r.ats_name == "greenhouse"
        # Verify the remaining three fields too, so a regression in any
        # constructor argument is caught rather than only two of five.
        assert r.confidence == 0.95
        assert r.method == "ats:greenhouse"
        assert r.position_url == "https://careers.acme.com/positions/1"
# ---------------------------------------------------------------------------
# Tier 1b — slug-guess in cascade ordering
# ---------------------------------------------------------------------------
class TestSlugGuessTier:
    """Tests that slug-guess wires correctly into the cascade between HTML-ATS and url_pattern."""

    _CASCADE = "jobsource.careers.cascade"

    @staticmethod
    def _always(value):
        """Build a stub that ignores its arguments and returns ``value``."""
        def stub(*args, **kwargs):
            return value

        return stub

    def _patch_for_slug_guess(self, monkeypatch, *, slug_guess_result=None,
                              url_pattern=None):
        """Patch cascade so HTML-ATS always misses; control slug_guess and url_pattern.

        Args:
            monkeypatch: pytest's ``monkeypatch`` fixture.
            slug_guess_result: value returned by ``_ats.recover_via_slug_guess``.
            url_pattern: value returned by ``_heuristics.probe_url_patterns``.
        """
        base = self._CASCADE
        monkeypatch.setattr(f"{base}._safe_get_html", lambda website, client: "<html/>")
        tier_results = {
            "_ats.detect_and_fetch": None,
            "_ats.recover_via_slug_guess": slug_guess_result,
            "_heuristics.probe_url_patterns": url_pattern,
            "_heuristics.scan_homepage_links": None,
            "_heuristics.parse_sitemap": None,
        }
        for attr, value in tier_results.items():
            monkeypatch.setattr(f"{base}.{attr}", self._always(value))
        # Keep _finalize from probing a heuristic hit for an ATS, either by
        # URL or by fetching the page, so no request is attempted.
        monkeypatch.setattr(f"{base}._ats.detect_ats_in_url", lambda url: None)
        monkeypatch.setattr(f"{base}._detect_ats_in_page", self._always(None))

    def test_slug_guess_hit_returns_090_confidence(self, monkeypatch):
        """A slug-guess hit should produce confidence=0.90 and method=ats:{name}:slug_guess."""
        slug_result = ATSResult(
            ats_name="greenhouse",
            careers_url="https://boards.greenhouse.io/anthropic",
            position_url="https://boards.greenhouse.io/anthropic/jobs/1",
            job_count=370,
        )
        self._patch_for_slug_guess(monkeypatch, slug_guess_result=slug_result)

        result = find_careers_page(
            "https://www.anthropic.com",
            company_name="Anthropic",
            client=FakeClient(),
        )

        assert result.confidence == 0.90
        assert result.method == "ats:greenhouse:slug_guess"
        assert result.careers_url == "https://boards.greenhouse.io/anthropic"
        assert result.ats_name == "greenhouse"
        assert result.position_url == "https://boards.greenhouse.io/anthropic/jobs/1"

    def test_slug_guess_hit_blocks_url_pattern(self, monkeypatch):
        """If slug-guess hits, url_pattern should not be called."""
        url_pattern_called: list[bool] = []
        slug_result = ATSResult(
            ats_name="lever",
            careers_url="https://jobs.lever.co/acme",
            position_url="https://jobs.lever.co/acme/xyz",
            job_count=5,
        )
        self._patch_for_slug_guess(monkeypatch, slug_guess_result=slug_result)

        def tracking_probe(*args, **kwargs):
            url_pattern_called.append(True)
            return None

        # Replace the url_pattern stub with a tracker that records any call.
        monkeypatch.setattr(
            f"{self._CASCADE}._heuristics.probe_url_patterns", tracking_probe
        )

        find_careers_page("https://www.acme.com", client=FakeClient())

        assert url_pattern_called == []

    def test_slug_guess_miss_falls_through_to_url_pattern(self, monkeypatch):
        """When slug-guess misses, the cascade continues to url_pattern."""
        self._patch_for_slug_guess(
            monkeypatch,
            slug_guess_result=None,
            url_pattern="https://acme.com/careers",
        )

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.method == "url_pattern"
        assert result.confidence == 0.80

    def test_company_name_forwarded_to_slug_guess(self, monkeypatch):
        """company_name must be passed through to recover_via_slug_guess."""
        received: list[tuple] = []

        def fake_recover(website, company_name, client):
            received.append((website, company_name))
            return None

        self._patch_for_slug_guess(monkeypatch)
        # Swap in a recorder for the slug-guess tier only.
        monkeypatch.setattr(
            f"{self._CASCADE}._ats.recover_via_slug_guess", fake_recover
        )

        find_careers_page(
            "https://www.anthropic.com",
            company_name="Anthropic",
            client=FakeClient(),
        )

        assert len(received) == 1
        assert received[0] == ("https://www.anthropic.com", "Anthropic")

    def test_slug_guess_tier_error_falls_through(self, monkeypatch):
        """A slug-guess tier that raises should not abort the cascade."""
        self._patch_for_slug_guess(monkeypatch, url_pattern="https://acme.com/careers")

        def boom(*args, **kwargs):
            raise RuntimeError("slug_guess exploded")

        # Override only the slug-guess tier so that it fails instead of missing.
        monkeypatch.setattr(f"{self._CASCADE}._ats.recover_via_slug_guess", boom)

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.method == "url_pattern"
        assert result.careers_url == "https://acme.com/careers"