553 lines
21 KiB
Python
553 lines
21 KiB
Python
"""Tests for jobsource/careers/cascade.py — all network-free via monkeypatching."""
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from jobsource.careers import CareersResult, find_careers_page
|
|
from jobsource.careers.cascade import _detect_ats_in_page, _finalize, _safe_get_html
|
|
from jobsource.careers.ats import ATSBoard, ATSFetch, ATSResult
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fake helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class FakeResponse:
    """Minimal stand-in for an HTTP response used by the cascade tests.

    It only carries the three attributes assigned in ``__init__``; no
    methods are provided.
    """

    def __init__(self, status_code: int, text: str = "", url: str = "https://example.com"):
        # Bind all three attributes in one statement.
        self.status_code, self.text, self.url = status_code, text, url
|
|
|
|
|
|
class FakeClient:
    """Empty placeholder passed wherever the code under test expects a client."""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _detect_ats_in_page
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestDetectATSInPage:
    """Tests for ``_detect_ats_in_page`` with ``request_with_retries`` stubbed out."""

    # Dotted path of the HTTP helper that every test in this class replaces.
    _REQUEST = "jobsource.careers.cascade.request_with_retries"

    def _stub_response(self, monkeypatch, status_code, body=""):
        """Make the HTTP helper return a fresh ``FakeResponse`` on every call."""

        def respond(*args, **kwargs):
            return FakeResponse(status_code, body)

        monkeypatch.setattr(self._REQUEST, respond)

    def test_returns_board_from_page_html(self, monkeypatch):
        page = '<a href="https://boards.greenhouse.io/vercel">Jobs</a>'
        self._stub_response(monkeypatch, 200, page)

        detected = _detect_ats_in_page("https://vercel.com/careers", FakeClient())

        assert detected is not None
        assert detected.ats_name == "greenhouse"
        assert detected.slug == "vercel"

    def test_returns_none_on_404(self, monkeypatch):
        self._stub_response(monkeypatch, 404)

        outcome = _detect_ats_in_page("https://vercel.com/careers", FakeClient())

        assert outcome is None

    def test_returns_none_on_exception(self, monkeypatch):
        def explode(*args, **kwargs):
            raise RuntimeError("timeout")

        monkeypatch.setattr(self._REQUEST, explode)

        outcome = _detect_ats_in_page("https://vercel.com/careers", FakeClient())

        assert outcome is None

    def test_returns_none_when_no_ats_in_page(self, monkeypatch):
        self._stub_response(monkeypatch, 200, "<html>no ats here</html>")

        outcome = _detect_ats_in_page("https://acme.com/careers", FakeClient())

        assert outcome is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _safe_get_html
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestSafeGetHtml:
    """Tests for ``_safe_get_html`` with ``request_with_retries`` stubbed out."""

    # Dotted path of the HTTP helper that every test in this class replaces.
    _REQUEST = "jobsource.careers.cascade.request_with_retries"

    def _stub_response(self, monkeypatch, status_code, body=""):
        """Make the HTTP helper return a fresh ``FakeResponse`` on every call."""

        def respond(*args, **kwargs):
            return FakeResponse(status_code, body)

        monkeypatch.setattr(self._REQUEST, respond)

    def test_returns_text_on_200(self, monkeypatch):
        self._stub_response(monkeypatch, 200, "page html")

        fetched = _safe_get_html("https://acme.com", FakeClient())

        assert fetched == "page html"

    def test_returns_none_on_404(self, monkeypatch):
        self._stub_response(monkeypatch, 404)

        fetched = _safe_get_html("https://acme.com", FakeClient())

        assert fetched is None

    def test_returns_none_on_exception(self, monkeypatch):
        def explode(*args, **kwargs):
            raise RuntimeError("network error")

        monkeypatch.setattr(self._REQUEST, explode)

        fetched = _safe_get_html("https://acme.com", FakeClient())

        assert fetched is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# find_careers_page — tier ordering and early return
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestCascadeTierOrdering:
    """Tier precedence, per-tier confidence and fall-through of ``find_careers_page``.

    Every tier is replaced by a stub, so the tests only observe which tier's
    canned value ends up in the returned result.
    """

    # Module whose attributes are replaced by the stubs below.
    _CASCADE = "jobsource.careers.cascade"

    @staticmethod
    def _returning(value):
        """Build a stub that ignores its arguments and returns ``value``."""

        def stub(*args, **kwargs):
            return value

        return stub

    @staticmethod
    def _raising(exc):
        """Build a stub that ignores its arguments and raises ``exc``."""

        def stub(*args, **kwargs):
            raise exc

        return stub

    def _patch_tiers(self, monkeypatch, *, ats=None, slug_guess=None,
                     url_pattern=None, homepage=None, sitemap=None, html="<html/>"):
        """Replace every cascade tier with a stub returning the given value.

        Each keyword is the canned return value for one tier; leaving it at
        ``None`` makes that tier miss. ``html`` is what the stubbed
        ``_safe_get_html`` returns.
        """
        monkeypatch.setattr(f"{self._CASCADE}._safe_get_html", lambda website, client: html)
        canned = {
            "_ats.detect_and_fetch": ats,
            "_ats.recover_via_slug_guess": slug_guess,
            "_heuristics.probe_url_patterns": url_pattern,
            "_heuristics.scan_homepage_links": homepage,
            "_heuristics.parse_sitemap": sitemap,
            # Keep _finalize from upgrading a heuristic hit: neither the URL
            # check nor the page-HTML check may find an ATS board. Stubbing
            # _detect_ats_in_page as well keeps the real HTTP helper out of
            # the picture (presumably _finalize consults it after the URL check).
            "_ats.detect_ats_in_url": None,
            "_detect_ats_in_page": None,
        }
        for attr, value in canned.items():
            monkeypatch.setattr(f"{self._CASCADE}.{attr}", self._returning(value))

    def test_ats_hit_returns_095_confidence(self, monkeypatch):
        hit = ATSResult(
            ats_name="greenhouse",
            careers_url="https://boards.greenhouse.io/acme",
            position_url="https://careers.acme.com/positions/1",
            job_count=10,
        )
        self._patch_tiers(monkeypatch, ats=hit)

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url == "https://boards.greenhouse.io/acme"
        assert result.confidence == 0.95
        assert result.method == "ats:greenhouse"
        assert result.ats_name == "greenhouse"
        assert result.position_url == "https://careers.acme.com/positions/1"

    def test_url_pattern_hit_when_ats_misses(self, monkeypatch):
        self._patch_tiers(monkeypatch, url_pattern="https://acme.com/careers")

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url == "https://acme.com/careers"
        assert result.confidence == 0.80
        assert result.method == "url_pattern"
        assert result.ats_name is None

    def test_homepage_scan_hit_when_ats_and_url_pattern_miss(self, monkeypatch):
        self._patch_tiers(monkeypatch, homepage="https://acme.com/careers")

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url == "https://acme.com/careers"
        assert result.confidence == 0.60
        assert result.method == "homepage_scan"

    def test_sitemap_hit_when_all_else_misses(self, monkeypatch):
        self._patch_tiers(monkeypatch, sitemap="https://acme.com/careers")

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url == "https://acme.com/careers"
        assert result.confidence == 0.50
        assert result.method == "sitemap"

    def test_all_miss_returns_none_method(self, monkeypatch):
        self._patch_tiers(monkeypatch)

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.careers_url is None
        assert result.confidence == 0.0
        assert result.method == "none"

    def test_ats_hit_skips_later_tiers(self, monkeypatch):
        """When ATS resolves, slug_guess/url_pattern/homepage/sitemap should not be called."""
        hit = ATSResult(ats_name="lever", careers_url="https://jobs.lever.co/acme")
        self._patch_tiers(monkeypatch, ats=hit)
        later_called: list[str] = []

        def tracker(name):
            def stub(*args, **kwargs):
                later_called.append(name)
                return None

            return stub

        # Override the later tiers so any call to them is recorded by name.
        later_tiers = {
            "_ats.recover_via_slug_guess": "slug_guess",
            "_heuristics.probe_url_patterns": "url_pattern",
            "_heuristics.scan_homepage_links": "homepage_scan",
            "_heuristics.parse_sitemap": "sitemap",
        }
        for attr, name in later_tiers.items():
            monkeypatch.setattr(f"{self._CASCADE}.{attr}", tracker(name))

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert later_called == []
        # Guard against a vacuous pass: the result must come from the ATS tier.
        assert result.careers_url == "https://jobs.lever.co/acme"

    def test_failing_tier_falls_through(self, monkeypatch):
        """A tier that raises should not abort the cascade."""
        self._patch_tiers(monkeypatch, url_pattern="https://acme.com/careers")
        # Replace the ATS tier with one that blows up instead of missing.
        monkeypatch.setattr(
            f"{self._CASCADE}._ats.detect_and_fetch",
            self._raising(RuntimeError("ats exploded")),
        )

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.method == "url_pattern"
        assert result.careers_url == "https://acme.com/careers"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ATS-URL upgrade in _finalize
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestFinalizeATSUpgrade:
    """Tests for the ATS upgrade step that ``_finalize`` applies to heuristic hits."""

    # Module whose attributes are replaced by the stubs below.
    _CASCADE = "jobsource.careers.cascade"

    def test_lever_url_upgrades_to_ats_lever(self, monkeypatch):
        """When a heuristic finds a Lever URL, _finalize upgrades to ats:lever."""
        lever_board = ATSBoard(
            ats_name="lever",
            slug="acme",
            careers_url="https://jobs.lever.co/acme",
        )

        def fetch_lever(board, client):
            return ATSFetch(first_url="https://jobs.lever.co/acme/abc", job_count=5)

        monkeypatch.setattr(f"{self._CASCADE}._ats.detect_ats_in_url", lambda url: lever_board)
        monkeypatch.setattr(f"{self._CASCADE}._ats._FETCH_DISPATCH", {"lever": fetch_lever})

        result = _finalize(
            "https://jobs.lever.co/acme", "homepage_scan", 0.60,
            "https://www.acme.com", FakeClient(),
        )

        assert result.method == "ats:lever"
        assert result.confidence == 0.95
        assert result.position_url == "https://jobs.lever.co/acme/abc"
        assert result.ats_name == "lever"

    def test_non_ats_url_no_upgrade(self, monkeypatch):
        monkeypatch.setattr(f"{self._CASCADE}._ats.detect_ats_in_url", lambda url: None)
        # Stub the page-HTML check too, so the test does not depend on the
        # real HTTP helper failing when handed a FakeClient.
        monkeypatch.setattr(f"{self._CASCADE}._detect_ats_in_page", lambda url, client: None)

        result = _finalize(
            "https://acme.com/careers", "url_pattern", 0.80,
            "https://www.acme.com", FakeClient(),
        )

        assert result.method == "url_pattern"
        assert result.confidence == 0.80
        assert result.position_url is None

    def test_ats_upgrade_from_page_html(self, monkeypatch):
        """URL pattern finds /careers; fetching that page reveals Greenhouse embed → upgrade."""
        gh_board = ATSBoard(
            ats_name="greenhouse",
            slug="vercel",
            careers_url="https://boards.greenhouse.io/vercel",
        )

        def fetch_greenhouse(board, client):
            return ATSFetch(
                first_url="https://job-boards.greenhouse.io/vercel/jobs/123",
                job_count=73,
            )

        # The URL string itself is not an ATS URL ...
        monkeypatch.setattr(f"{self._CASCADE}._ats.detect_ats_in_url", lambda url: None)
        # ... but the page HTML reveals Greenhouse.
        monkeypatch.setattr(f"{self._CASCADE}._detect_ats_in_page", lambda url, client: gh_board)
        monkeypatch.setattr(
            f"{self._CASCADE}._ats._FETCH_DISPATCH",
            {"greenhouse": fetch_greenhouse},
        )

        result = _finalize(
            "https://vercel.com/careers", "url_pattern", 0.80,
            "https://vercel.com", FakeClient(),
        )

        assert result.method == "ats:greenhouse"
        assert result.confidence == 0.95
        assert result.careers_url == "https://boards.greenhouse.io/vercel"
        assert result.position_url == "https://job-boards.greenhouse.io/vercel/jobs/123"
        assert result.ats_name == "greenhouse"

    def test_ats_upgrade_fetch_failure_falls_back_to_original(self, monkeypatch):
        """If ATS fetch during upgrade fails, return the original heuristic result."""
        gh_board = ATSBoard(
            ats_name="greenhouse",
            slug="acme",
            careers_url="https://boards.greenhouse.io/acme",
        )

        def boom(board, client):
            raise RuntimeError("api down")

        monkeypatch.setattr(f"{self._CASCADE}._ats.detect_ats_in_url", lambda url: gh_board)
        monkeypatch.setattr(f"{self._CASCADE}._ats._FETCH_DISPATCH", {"greenhouse": boom})

        result = _finalize(
            "https://boards.greenhouse.io/acme", "sitemap", 0.50,
            "https://www.acme.com", FakeClient(),
        )

        # Upgrade failed — returns the original heuristic result
        assert result.method == "sitemap"
        assert result.confidence == 0.50
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CareersResult model
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestCareersResultModel:
    """Checks the default and explicitly populated field values of ``CareersResult``."""

    def test_defaults(self):
        empty = CareersResult()

        assert empty.careers_url is None
        assert empty.confidence == 0.0
        assert empty.method == "none"
        assert empty.ats_name is None
        assert empty.position_url is None

    def test_full(self):
        populated = CareersResult(
            careers_url="https://boards.greenhouse.io/acme",
            confidence=0.95,
            method="ats:greenhouse",
            ats_name="greenhouse",
            position_url="https://careers.acme.com/positions/1",
        )

        # Every field passed in must be readable back, not just two of them.
        assert populated.careers_url == "https://boards.greenhouse.io/acme"
        assert populated.confidence == 0.95
        assert populated.method == "ats:greenhouse"
        assert populated.ats_name == "greenhouse"
        assert populated.position_url == "https://careers.acme.com/positions/1"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tier 1b — slug-guess in cascade ordering
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestSlugGuessTier:
    """Tests that slug-guess wires correctly into the cascade between HTML-ATS and url_pattern."""

    # Module whose attributes are replaced by the stubs below.
    _CASCADE = "jobsource.careers.cascade"

    @staticmethod
    def _returning(value):
        """Build a stub that ignores its arguments and returns ``value``."""

        def stub(*args, **kwargs):
            return value

        return stub

    def _patch_for_slug_guess(self, monkeypatch, *, slug_guess_result=None,
                              url_pattern=None):
        """Patch cascade so HTML-ATS always misses; control slug_guess and url_pattern.

        The homepage-scan and sitemap tiers are stubbed to miss. The two ATS
        checks that may upgrade a heuristic hit are stubbed to find nothing.
        """
        monkeypatch.setattr(
            f"{self._CASCADE}._safe_get_html",
            lambda website, client: "<html/>",
        )
        canned = {
            "_ats.detect_and_fetch": None,
            "_ats.recover_via_slug_guess": slug_guess_result,
            "_heuristics.probe_url_patterns": url_pattern,
            "_heuristics.scan_homepage_links": None,
            "_heuristics.parse_sitemap": None,
            "_ats.detect_ats_in_url": None,
            # Presumably consulted by _finalize after the URL check; stubbing it
            # keeps a url_pattern hit away from the real HTTP helper.
            "_detect_ats_in_page": None,
        }
        for attr, value in canned.items():
            monkeypatch.setattr(f"{self._CASCADE}.{attr}", self._returning(value))

    def test_slug_guess_hit_returns_090_confidence(self, monkeypatch):
        """A slug-guess hit should produce confidence=0.90 and method=ats:{name}:slug_guess."""
        slug_result = ATSResult(
            ats_name="greenhouse",
            careers_url="https://boards.greenhouse.io/anthropic",
            position_url="https://boards.greenhouse.io/anthropic/jobs/1",
            job_count=370,
        )
        self._patch_for_slug_guess(monkeypatch, slug_guess_result=slug_result)

        result = find_careers_page(
            "https://www.anthropic.com",
            company_name="Anthropic",
            client=FakeClient(),
        )

        assert result.confidence == 0.90
        assert result.method == "ats:greenhouse:slug_guess"
        assert result.careers_url == "https://boards.greenhouse.io/anthropic"
        assert result.ats_name == "greenhouse"
        assert result.position_url == "https://boards.greenhouse.io/anthropic/jobs/1"

    def test_slug_guess_hit_blocks_url_pattern(self, monkeypatch):
        """If slug-guess hits, url_pattern should not be called."""
        url_pattern_called: list[bool] = []
        slug_result = ATSResult(
            ats_name="lever",
            careers_url="https://jobs.lever.co/acme",
            position_url="https://jobs.lever.co/acme/xyz",
            job_count=5,
        )
        self._patch_for_slug_guess(monkeypatch, slug_guess_result=slug_result)

        def track_url_pattern(*args, **kwargs):
            url_pattern_called.append(True)
            return None

        # Override url_pattern with a tracker
        monkeypatch.setattr(
            f"{self._CASCADE}._heuristics.probe_url_patterns",
            track_url_pattern,
        )

        find_careers_page("https://www.acme.com", client=FakeClient())

        assert url_pattern_called == []

    def test_slug_guess_miss_falls_through_to_url_pattern(self, monkeypatch):
        """When slug-guess misses, the cascade continues to url_pattern."""
        self._patch_for_slug_guess(
            monkeypatch,
            slug_guess_result=None,
            url_pattern="https://acme.com/careers",
        )

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.method == "url_pattern"
        assert result.confidence == 0.80

    def test_company_name_forwarded_to_slug_guess(self, monkeypatch):
        """company_name must be passed through to recover_via_slug_guess."""
        received: list[tuple] = []

        def fake_recover(website, company_name, client):
            received.append((website, company_name))
            return None

        self._patch_for_slug_guess(monkeypatch)
        # Swap the slug-guess stub for one that records its arguments.
        monkeypatch.setattr(f"{self._CASCADE}._ats.recover_via_slug_guess", fake_recover)

        find_careers_page(
            "https://www.anthropic.com",
            company_name="Anthropic",
            client=FakeClient(),
        )

        assert len(received) == 1
        assert received[0] == ("https://www.anthropic.com", "Anthropic")

    def test_slug_guess_tier_error_falls_through(self, monkeypatch):
        """A slug-guess tier that raises should not abort the cascade."""

        def explode(*args, **kwargs):
            raise RuntimeError("slug_guess exploded")

        self._patch_for_slug_guess(monkeypatch, url_pattern="https://acme.com/careers")
        # Replace the slug-guess tier with one that blows up instead of missing.
        monkeypatch.setattr(f"{self._CASCADE}._ats.recover_via_slug_guess", explode)

        result = find_careers_page("https://www.acme.com", client=FakeClient())

        assert result.method == "url_pattern"
        assert result.careers_url == "https://acme.com/careers"
|