"""Tests for jobsource/careers/ats.py — all network-free via monkeypatching.""" from __future__ import annotations import pytest from jobsource.careers.ats import ( ATSBoard, ATSFetch, ATSResult, _board_from_slug, _domain_stem, _fetch_ashby, _fetch_greenhouse, _fetch_lever, _fetch_workday, _loose_name_match, _slug_candidates, detect_and_fetch, detect_ats_in_html, detect_ats_in_url, recover_via_slug_guess, ) # --------------------------------------------------------------------------- # Tiny fake HTTP response for monkeypatching request_with_retries # --------------------------------------------------------------------------- class FakeResponse: def __init__(self, status_code: int, body: object, url: str = "https://example.com"): self.status_code = status_code self._body = body self.url = url self.text = str(body) def json(self) -> object: return self._body class FakeClient: """Stands in for httpx.Client; never actually used in network calls here.""" # --------------------------------------------------------------------------- # detect_ats_in_html — Greenhouse # --------------------------------------------------------------------------- class TestDetectATSInHtmlGreenhouse: def test_boards_greenhouse_script_tag(self): html = '' board = detect_ats_in_html(html) assert board is not None assert board.ats_name == "greenhouse" assert board.slug == "airbnb" assert board.careers_url == "https://boards.greenhouse.io/airbnb" def test_boards_greenhouse_direct_link(self): html = 'Jobs' board = detect_ats_in_html(html) assert board is not None assert board.ats_name == "greenhouse" assert board.slug == "acme" def test_job_boards_subdomain(self): html = 'Jobs' board = detect_ats_in_html(html) assert board is not None assert board.ats_name == "greenhouse" assert board.slug == "stripe" def test_no_match_returns_none(self): assert detect_ats_in_html("
Nothing here") is None # --------------------------------------------------------------------------- # detect_ats_in_html — Lever # --------------------------------------------------------------------------- class TestDetectATSInHtmlLever: def test_jobs_lever_link(self): html = 'Open roles' board = detect_ats_in_html(html) assert board is not None assert board.ats_name == "lever" assert board.slug == "leverdemo" assert board.careers_url == "https://jobs.lever.co/leverdemo" def test_lever_embed_script(self): html = 'var lever = "jobs.lever.co/acme-corp";' board = detect_ats_in_html(html) assert board is not None assert board.ats_name == "lever" assert board.slug == "acme-corp" # --------------------------------------------------------------------------- # detect_ats_in_html — Ashby # --------------------------------------------------------------------------- class TestDetectATSInHtmlAshby: def test_jobs_ashbyhq_link(self): html = 'Careers' board = detect_ats_in_html(html) assert board is not None assert board.ats_name == "ashby" assert board.slug == "Ramp" assert board.careers_url == "https://jobs.ashbyhq.com/Ramp" def test_lowercase_slug(self): html = 'Join us' board = detect_ats_in_html(html) assert board is not None assert board.slug == "linear" # --------------------------------------------------------------------------- # detect_ats_in_html — Workday # --------------------------------------------------------------------------- class TestDetectATSInHtmlWorkday: def test_myworkdayjobs_link(self): html = 'Jobs' board = detect_ats_in_html(html) assert board is not None assert board.ats_name == "workday" assert board.wd_host == "nvidia.wd5.myworkdayjobs.com" assert board.wd_tenant == "nvidia" assert board.wd_site == "NVIDIAExternalCareerSite" assert "en-US" in board.careers_url assert board.careers_url == "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite" def test_workday_without_locale(self): html = 'Careers' board = detect_ats_in_html(html) assert board is not None assert board.ats_name == "workday" assert board.wd_site == "AcmeCareers" def test_workday_missing_site_returns_none(self): # Just the host with no path — can't form a board html = 'https://acme.wd1.myworkdayjobs.com' board = detect_ats_in_html(html) assert board is None # --------------------------------------------------------------------------- # detect_ats_in_url # --------------------------------------------------------------------------- class TestDetectATSInUrl: def test_greenhouse_url(self): board = detect_ats_in_url("https://boards.greenhouse.io/stripe") assert board is not None assert board.ats_name == "greenhouse" assert board.slug == "stripe" def test_lever_url(self): board = detect_ats_in_url("https://jobs.lever.co/leverdemo") assert board is not None assert board.ats_name == "lever" def test_ashby_url(self): board = detect_ats_in_url("https://jobs.ashbyhq.com/linear") assert board is not None assert board.ats_name == "ashby" def test_workday_url(self): board = detect_ats_in_url( "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite" ) assert board is not None assert board.ats_name == "workday" def test_non_ats_url_returns_none(self): assert detect_ats_in_url("https://www.acme.com/careers") is None # --------------------------------------------------------------------------- # _fetch_greenhouse # --------------------------------------------------------------------------- class TestFetchGreenhouse: def _board(self, slug: str = "airbnb") -> ATSBoard: return ATSBoard( ats_name="greenhouse", slug=slug, careers_url=f"https://boards.greenhouse.io/{slug}", ) def test_extracts_absolute_url(self, monkeypatch): fake_body = { "jobs": [{"absolute_url": "https://careers.airbnb.com/positions/123", "company_name": "Airbnb"}], "meta": {"total": 42}, } monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda client, method, url, **kw: FakeResponse(200, fake_body), ) fetch = _fetch_greenhouse(self._board(), FakeClient()) assert fetch.first_url == "https://careers.airbnb.com/positions/123" assert fetch.job_count == 42 assert fetch.org_name == "Airbnb" def test_empty_jobs_list(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(200, {"jobs": [], "meta": {"total": 0}}), ) fetch = _fetch_greenhouse(self._board(), FakeClient()) assert fetch.first_url is None assert fetch.job_count == 0 assert fetch.org_name is None def test_non_200_returns_none(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(404, {}), ) fetch = _fetch_greenhouse(self._board(), FakeClient()) assert fetch.first_url is None assert fetch.job_count == 0 def test_network_error_returns_none(self, monkeypatch): def boom(*a, **kw): raise RuntimeError("network down") monkeypatch.setattr("jobsource.careers.ats.request_with_retries", boom) fetch = _fetch_greenhouse(self._board(), FakeClient()) assert fetch.first_url is None assert fetch.job_count == 0 # --------------------------------------------------------------------------- # _fetch_lever # --------------------------------------------------------------------------- class TestFetchLever: def _board(self, slug: str = "leverdemo") -> ATSBoard: return ATSBoard( ats_name="lever", slug=slug, careers_url=f"https://jobs.lever.co/{slug}", ) def test_extracts_hosted_url(self, monkeypatch): fake_body = [{"hostedUrl": "https://jobs.lever.co/leverdemo/abc-123"}] monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(200, fake_body), ) fetch = _fetch_lever(self._board(), FakeClient()) assert fetch.first_url == "https://jobs.lever.co/leverdemo/abc-123" assert fetch.job_count == 1 def test_empty_list(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(200, []), ) fetch = _fetch_lever(self._board(), FakeClient()) assert fetch.first_url is None assert fetch.job_count == 0 def test_non_list_response(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(200, {"error": "not found"}), ) fetch = _fetch_lever(self._board(), FakeClient()) assert fetch.first_url is None assert fetch.job_count == 0 def test_non_200_returns_none(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(404, []), ) fetch = _fetch_lever(self._board(), FakeClient()) assert fetch.first_url is None # --------------------------------------------------------------------------- # _fetch_ashby # --------------------------------------------------------------------------- class TestFetchAshby: def _board(self, slug: str = "Ramp") -> ATSBoard: return ATSBoard( ats_name="ashby", slug=slug, careers_url=f"https://jobs.ashbyhq.com/{slug}", ) def test_extracts_job_url(self, monkeypatch): fake_body = { "jobs": [{"jobUrl": "https://jobs.ashbyhq.com/Ramp/abc-def"}], "apiVersion": "1", } monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(200, fake_body), ) fetch = _fetch_ashby(self._board(), FakeClient()) assert fetch.first_url == "https://jobs.ashbyhq.com/Ramp/abc-def" assert fetch.job_count == 1 def test_empty_jobs(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(200, {"jobs": []}), ) fetch = _fetch_ashby(self._board(), FakeClient()) assert fetch.first_url is None assert fetch.job_count == 0 def test_network_error_returns_none(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: (_ for _ in ()).throw(RuntimeError("timeout")), ) fetch = _fetch_ashby(self._board(), FakeClient()) assert fetch.first_url is None # --------------------------------------------------------------------------- # _fetch_workday # --------------------------------------------------------------------------- class TestFetchWorkday: def _board(self) -> ATSBoard: return ATSBoard( ats_name="workday", slug="nvidia/NVIDIAExternalCareerSite", careers_url="https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite", wd_host="nvidia.wd5.myworkdayjobs.com", wd_tenant="nvidia", wd_site="NVIDIAExternalCareerSite", ) def test_builds_full_job_url(self, monkeypatch): fake_body = { "total": 2000, "jobPostings": [{"externalPath": "/job/US/SWE_JR123"}], } monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(200, fake_body), ) fetch = _fetch_workday(self._board(), FakeClient()) assert fetch.first_url == "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite/job/US/SWE_JR123" assert fetch.job_count == 2000 def test_empty_postings(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(200, {"total": 0, "jobPostings": []}), ) fetch = _fetch_workday(self._board(), FakeClient()) assert fetch.first_url is None assert fetch.job_count == 0 def test_missing_wd_coords_returns_none(self): board = ATSBoard( ats_name="workday", slug="x", careers_url="https://x.wd1.myworkdayjobs.com" ) fetch = _fetch_workday(board, FakeClient()) assert fetch.first_url is None assert fetch.job_count == 0 def test_non_200_returns_none(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(403, {}), ) fetch = _fetch_workday(self._board(), FakeClient()) assert fetch.first_url is None # --------------------------------------------------------------------------- # detect_and_fetch orchestration # --------------------------------------------------------------------------- class TestDetectAndFetch: def test_greenhouse_full_flow(self, monkeypatch): html = 'Jobs' job_resp = { "jobs": [{"absolute_url": "https://careers.airbnb.com/positions/1"}], "meta": {"total": 5}, } monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(200, job_resp), ) result = detect_and_fetch("https://www.airbnb.com", FakeClient(), homepage_html=html) assert result is not None assert result.ats_name == "greenhouse" assert result.careers_url == "https://boards.greenhouse.io/airbnb" assert result.position_url == "https://careers.airbnb.com/positions/1" assert result.job_count == 5 def test_no_ats_returns_none(self, monkeypatch): html = "No ATS here" result = detect_and_fetch("https://www.example.com", FakeClient(), homepage_html=html) assert result is None def test_api_failure_returns_result_without_position_url(self, monkeypatch): html = 'Jobs' monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(500, []), ) result = detect_and_fetch("https://www.acme.com", FakeClient(), homepage_html=html) assert result is not None assert result.ats_name == "lever" assert result.careers_url == "https://jobs.lever.co/acme" assert result.position_url is None def test_homepage_fetch_failure_returns_none(self, monkeypatch): """When homepage_html is None and the fetch fails, return None.""" def boom(*a, **kw): raise RuntimeError("connection refused") monkeypatch.setattr("jobsource.careers.ats.request_with_retries", boom) result = detect_and_fetch("https://www.example.com", FakeClient()) assert result is None def test_uses_provided_html_without_fetching(self, monkeypatch): """If homepage_html is provided, request_with_retries is only called for the API.""" html = 'Jobs' calls: list[str] = [] job_resp = {"jobs": [{"jobUrl": "https://jobs.ashbyhq.com/linear/xyz"}]} def fake_req(client, method, url, **kw): calls.append(url) return FakeResponse(200, job_resp) monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) result = detect_and_fetch("https://www.linear.app", FakeClient(), homepage_html=html) assert result is not None # Only one call: the API fetch (not the homepage) assert len(calls) == 1 assert "ashby" in calls[0] # --------------------------------------------------------------------------- # Pure-unit helpers # --------------------------------------------------------------------------- class TestDomainStem: def test_strips_www(self): assert _domain_stem("https://www.anthropic.com") == "anthropic" def test_no_www(self): assert _domain_stem("https://linear.app") == "linear" def test_with_path(self): assert _domain_stem("https://www.figma.com/careers/") == "figma" def test_invalid_returns_none(self): assert _domain_stem("") is None or isinstance(_domain_stem(""), (str, type(None))) class TestSlugCandidates: def test_domain_stem_first(self): candidates = _slug_candidates("https://www.anthropic.com", "Anthropic") assert candidates[0] == "anthropic" def test_deduplicates_stem_and_name(self): # stem == normalized name → only one entry candidates = _slug_candidates("https://www.anthropic.com", "Anthropic") assert candidates.count("anthropic") == 1 def test_different_stem_and_name(self): # stem differs from normalized name → both appear candidates = _slug_candidates("https://www.acmecorp.com", "Acme Corp Inc") assert "acmecorp" in candidates assert "acmecorp" in candidates or "acmecorp" in candidates # normalized name strips "Inc" → "acmecorp" def test_name_only_candidate_when_stem_equal(self): # When stem and slug match, only one entry candidates = _slug_candidates("https://ramp.com", "Ramp") assert len(candidates) == 1 assert candidates[0] == "ramp" def test_no_company_name_uses_stem_only(self): candidates = _slug_candidates("https://www.anthropic.com", None) assert candidates == ["anthropic"] def test_max_three_candidates(self): # Can't produce more than 3 candidates = _slug_candidates("https://www.x.com", "X Corp Inc") assert len(candidates) <= 3 class TestLooseNameMatch: def test_exact_match(self): assert _loose_name_match("Anthropic", "Anthropic") is True def test_one_substring_of_other(self): assert _loose_name_match("Acme", "Acme Corp Inc") is True def test_clear_mismatch(self): assert _loose_name_match("Acme", "Globex") is False def test_empty_input_returns_true(self): assert _loose_name_match("", "Acme") is True def test_empty_org_returns_true(self): assert _loose_name_match("Acme", "") is True def test_case_insensitive(self): assert _loose_name_match("ANTHROPIC", "anthropic") is True # --------------------------------------------------------------------------- # recover_via_slug_guess # --------------------------------------------------------------------------- class TestRecoverViaSlugGuess: """All tests drive the real _fetch_* via a URL-dispatching fake request_with_retries.""" def _gh_resp(self, slug: str, company_name: str, count: int = 5) -> dict: """Canned Greenhouse response with jobs.""" return { "jobs": [{"absolute_url": f"https://boards.greenhouse.io/{slug}/jobs/1", "company_name": company_name}], "meta": {"total": count}, } def _gh_empty(self) -> dict: return {"jobs": [], "meta": {"total": 0}} def _lever_resp(self, slug: str, count: int = 3) -> list: return [{"hostedUrl": f"https://jobs.lever.co/{slug}/abc"}] * count def _ashby_resp(self, slug: str, count: int = 2) -> dict: return {"jobs": [{"jobUrl": f"https://jobs.ashbyhq.com/{slug}/xyz"}] * count} def _ashby_empty(self) -> dict: return {"jobs": []} def _lever_empty(self) -> list: return [] # ----- Domain-stem hit (Anthropic-style) ----- def test_domain_stem_greenhouse_hit(self, monkeypatch): """Greenhouse slug derived from domain stem → returns ATSResult.""" def fake_req(client, method, url, **kw): if "boards-api.greenhouse.io/v1/boards/anthropic" in url: return FakeResponse(200, self._gh_resp("anthropic", "Anthropic", count=370)) # All other probes empty if "lever.co" in url: return FakeResponse(200, self._lever_empty()) if "ashby" in url: return FakeResponse(200, self._ashby_empty()) return FakeResponse(404, {}) monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) result = recover_via_slug_guess( "https://www.anthropic.com", "Anthropic", FakeClient() ) assert result is not None assert result.ats_name == "greenhouse" assert result.careers_url == "https://boards.greenhouse.io/anthropic" assert result.position_url is not None assert result.job_count == 370 # ----- Name-candidate fallback when stem misses ----- def test_name_candidate_fallback(self, monkeypatch): """Stem misses; _slug(company_name) slug hits on Lever.""" def fake_req(client, method, url, **kw): # Stem slug "acmecorp" → greenhouse empty, lever empty, ashby empty if "acmecorp" in url and "boards-api" in url: return FakeResponse(200, self._gh_empty()) if "acmecorp" in url and "lever" in url: return FakeResponse(200, self._lever_empty()) if "acmecorp" in url and "ashby" in url: return FakeResponse(200, self._ashby_empty()) # Name slug "acme" → lever hit if "acme" in url and "lever" in url: return FakeResponse(200, self._lever_resp("acme")) if "acme" in url and "boards-api" in url: return FakeResponse(200, self._gh_empty()) return FakeResponse(404, {}) monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) result = recover_via_slug_guess( "https://www.acmecorp.com", "Acme", FakeClient() ) assert result is not None assert result.ats_name == "lever" assert "acme" in result.careers_url # ----- 0-jobs reject ----- def test_zero_jobs_rejected(self, monkeypatch): """All slugs resolve but job_count==0 everywhere → None.""" def fake_req(client, method, url, **kw): if "boards-api" in url: return FakeResponse(200, self._gh_empty()) if "lever" in url: return FakeResponse(200, self._lever_empty()) if "ashby" in url: return FakeResponse(200, self._ashby_empty()) return FakeResponse(404, {}) monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) result = recover_via_slug_guess( "https://www.acme.com", "Acme", FakeClient() ) assert result is None # ----- Org-name mismatch reject (collision guard) ----- def test_org_name_mismatch_rejected(self, monkeypatch): """Greenhouse returns jobs but org_name is a different company → skip to lever/ashby → miss.""" def fake_req(client, method, url, **kw): if "boards-api" in url: # Returns jobs but for wrong company return FakeResponse(200, self._gh_resp("acme", "Globex Corporation", count=10)) if "lever" in url: return FakeResponse(200, self._lever_empty()) if "ashby" in url: return FakeResponse(200, self._ashby_empty()) return FakeResponse(404, {}) monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) result = recover_via_slug_guess( "https://www.acme.com", "Acme Corp", FakeClient() ) assert result is None # ----- Lever hit (no org_name) accepted on job_count alone ----- def test_lever_hit_without_org_name_accepted(self, monkeypatch): """Lever doesn't expose org_name → cross-check is skipped; job_count>0 wins.""" def fake_req(client, method, url, **kw): if "boards-api" in url: return FakeResponse(200, self._gh_empty()) if "lever" in url: return FakeResponse(200, self._lever_resp("acme")) if "ashby" in url: return FakeResponse(200, self._ashby_empty()) return FakeResponse(404, {}) monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) result = recover_via_slug_guess( "https://www.acme.com", "Completely Different Name", FakeClient() ) # Lever returns jobs, no org_name → cross-check skipped → accepted assert result is not None assert result.ats_name == "lever" # ----- Short-circuit: greenhouse hit stops remaining probes ----- def test_short_circuits_on_first_hit(self, monkeypatch): """Once Greenhouse hits, Lever and Ashby are NOT probed.""" probed: list[str] = [] def fake_req(client, method, url, **kw): probed.append(url) if "boards-api.greenhouse.io/v1/boards/acme" in url: return FakeResponse(200, self._gh_resp("acme", "Acme", count=5)) if "lever" in url: return FakeResponse(200, self._lever_resp("acme")) if "ashby" in url: return FakeResponse(200, self._ashby_resp("acme")) return FakeResponse(404, {}) monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) result = recover_via_slug_guess( "https://www.acme.com", "Acme", FakeClient() ) assert result is not None assert result.ats_name == "greenhouse" # Lever and Ashby URLs must not have been probed assert not any("lever" in u for u in probed) assert not any("ashby" in u for u in probed) # ----- company_name=None: stem-only, cross-check skipped ----- def test_no_company_name_uses_stem_and_skips_crosscheck(self, monkeypatch): """With company_name=None, use domain stem only; org_name cross-check skipped.""" def fake_req(client, method, url, **kw): if "boards-api.greenhouse.io/v1/boards/acme" in url: return FakeResponse(200, self._gh_resp("acme", "Some Other Company")) if "lever" in url: return FakeResponse(200, self._lever_empty()) if "ashby" in url: return FakeResponse(200, self._ashby_empty()) return FakeResponse(404, {}) monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) # company_name=None → cross-check disabled even if org_name differs result = recover_via_slug_guess( "https://www.acme.com", None, FakeClient() ) assert result is not None assert result.ats_name == "greenhouse" # ----- All-miss ----- def test_all_miss_returns_none(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.ats.request_with_retries", lambda *a, **kw: FakeResponse(404, {}), ) result = recover_via_slug_guess( "https://www.nobody.com", "Nobody Inc", FakeClient() ) assert result is None # ----- Network error on one probe falls through ----- def test_single_probe_error_falls_through(self, monkeypatch): """A probe that raises should not abort recovery; others are still tried.""" call_count = [0] def fake_req(client, method, url, **kw): call_count[0] += 1 if "boards-api" in url: raise RuntimeError("greenhouse down") if "lever" in url: return FakeResponse(200, self._lever_resp("acme")) if "ashby" in url: return FakeResponse(200, self._ashby_empty()) return FakeResponse(404, {}) monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) result = recover_via_slug_guess( "https://www.acme.com", None, FakeClient() ) assert result is not None assert result.ats_name == "lever"