"""Tests for jobsource/careers/heuristics.py — all network-free.""" from __future__ import annotations import pytest from jobsource.careers.heuristics import ( _base_parts, _is_plausible_careers_url, _score_anchor, parse_sitemap, probe_url_patterns, scan_homepage_links, ) # --------------------------------------------------------------------------- # Tiny fake response helper # --------------------------------------------------------------------------- class FakeResponse: def __init__(self, status_code: int, text: str = "", url: str = "https://example.com"): self.status_code = status_code self.text = text self.url = url def json(self) -> object: return {} class FakeClient: pass # --------------------------------------------------------------------------- # _is_plausible_careers_url # --------------------------------------------------------------------------- class TestIsPlausibleCareersUrl: def test_same_domain_clean_path(self): assert _is_plausible_careers_url("https://netflix.com/careers", "https://netflix.com/careers") is True def test_same_domain_deeper_path(self): # Google /careers → /about/careers/applications/ assert _is_plausible_careers_url( "https://google.com/careers", "https://google.com/about/careers/applications/", ) is True def test_soft_404_notfound_rejected(self): # Netflix SPA: /careers → /NotFound?prev=... assert _is_plausible_careers_url( "https://www.netflix.com/careers", "https://www.netflix.com/NotFound?prev=https%3A%2F%2Fwww.netflix.com%2Fcareers", ) is False def test_soft_404_slash404_rejected(self): assert _is_plausible_careers_url( "https://example.com/jobs", "https://example.com/404", ) is False def test_soft_404_not_found_hyphen_rejected(self): assert _is_plausible_careers_url( "https://example.com/jobs", "https://example.com/not-found", ) is False def test_off_brand_cross_domain_rejected(self): # Microsoft /careers → bing.com assert _is_plausible_careers_url( "https://www.microsoft.com/careers", "https://www.bing.com?ref=aka&shorturl=abc", ) is False def test_on_brand_cross_domain_accepted(self): # Amazon /careers → amazon.jobs assert _is_plausible_careers_url( "https://www.amazon.com/careers", "https://amazon.jobs/en/", ) is True def test_workday_subdomain_accepted(self): # nvidia.com/careers → nvidia.wd5.myworkdayjobs.com/... assert _is_plausible_careers_url( "https://www.nvidia.com/careers", "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite", ) is True def test_short_brand_not_filtered(self): # brand <= 3 chars: skip the off-brand check assert _is_plausible_careers_url( "https://ibm.com/careers", "https://jobs.example.com/", ) is True # "ibm" len=3 → no cross-domain filter applied # --------------------------------------------------------------------------- # _base_parts # --------------------------------------------------------------------------- class TestBaseParts: def test_strips_www(self): scheme, host, root = _base_parts("https://www.acme.com/about") assert scheme == "https" assert host == "www.acme.com" assert root == "acme.com" def test_no_www(self): _, _, root = _base_parts("https://acme.com") assert root == "acme.com" def test_subdomain_preserved_in_host(self): _, host, root = _base_parts("https://careers.acme.com/jobs") assert host == "careers.acme.com" assert root == "careers.acme.com" # www. stripping only def test_http_scheme(self): scheme, _, _ = _base_parts("http://acme.com") assert scheme == "http" # --------------------------------------------------------------------------- # _score_anchor # --------------------------------------------------------------------------- class TestScoreAnchor: def test_careers_href_high_score(self): assert _score_anchor("/careers", "") > 2.0 def test_jobs_href_score(self): assert _score_anchor("/jobs", "") > 0 def test_text_careers_adds_score(self): score_with = _score_anchor("/x", "Careers") score_without = _score_anchor("/x", "About") assert score_with > score_without def test_unrelated_href_and_text_zero(self): assert _score_anchor("/about", "About us") == 0.0 def test_combined_href_and_text(self): combined = _score_anchor("/careers", "Careers") href_only = _score_anchor("/careers", "") assert combined > href_only # --------------------------------------------------------------------------- # probe_url_patterns # --------------------------------------------------------------------------- class TestProbeUrlPatterns: def test_returns_careers_path_on_hit(self, monkeypatch): def fake_probe(client, url): if url.endswith("/careers"): return "https://acme.com/careers" return None monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe) result = probe_url_patterns("https://acme.com", FakeClient()) assert result == "https://acme.com/careers" def test_returns_none_when_all_miss(self, monkeypatch): monkeypatch.setattr("jobsource.careers.heuristics.probe_url", lambda c, u: None) result = probe_url_patterns("https://acme.com", FakeClient()) assert result is None def test_returns_first_hit_not_second(self, monkeypatch): hits = [] def fake_probe(client, url): hits.append(url) if url.endswith("/career"): return "https://acme.com/career" return None monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe) result = probe_url_patterns("https://acme.com", FakeClient()) assert result == "https://acme.com/career" # /careers was probed first and missed; /career was the first hit assert any("/careers" in u for u in hits) def test_careers_subdomain_candidate_included(self, monkeypatch): probed: list[str] = [] def fake_probe(client, url): probed.append(url) return None monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe) probe_url_patterns("https://www.acme.com", FakeClient()) assert any("careers.acme.com" in u for u in probed) assert any("jobs.acme.com" in u for u in probed) def test_jobs_subdomain_hit(self, monkeypatch): def fake_probe(client, url): if "jobs.acme.com" in url: return "https://jobs.acme.com" return None monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe) result = probe_url_patterns("https://www.acme.com", FakeClient()) assert result == "https://jobs.acme.com" def test_soft_404_rejected_falls_through_to_none(self, monkeypatch): # All candidates redirect to a NotFound page — should return None monkeypatch.setattr( "jobsource.careers.heuristics.probe_url", lambda c, u: u.replace("/careers", "/NotFound").replace("/career", "/NotFound") .replace("/jobs", "/NotFound").replace("/join-us", "/NotFound") .replace("/join", "/NotFound"), ) result = probe_url_patterns("https://www.netflix.com", FakeClient()) assert result is None def test_off_brand_redirect_rejected(self, monkeypatch): # /careers redirects to an off-brand domain → skip; later candidate hits def fake_probe(client, url): if url.endswith("/careers"): return "https://www.bing.com?ref=aka" if url.endswith("/jobs"): return "https://acme.com/jobs" return None monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe) result = probe_url_patterns("https://www.acme.com", FakeClient()) assert result == "https://acme.com/jobs" # --------------------------------------------------------------------------- # scan_homepage_links # --------------------------------------------------------------------------- HOMEPAGE_WITH_CAREERS = """ """ HOMEPAGE_NO_CAREER_LINKS = """ """ HOMEPAGE_RELATIVE_LINK = """ Open Jobs """ HOMEPAGE_ABSOLUTE_LEVER = """ Work with us """ class TestScanHomepageLinks: def test_finds_careers_link(self): result = scan_homepage_links( "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_WITH_CAREERS ) assert result is not None assert "careers" in result def test_no_career_links_returns_none(self): result = scan_homepage_links( "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_NO_CAREER_LINKS ) assert result is None def test_relative_href_resolved(self): result = scan_homepage_links( "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_RELATIVE_LINK ) # "jobs" href + "Open Jobs" text should score above threshold assert result is not None assert result.startswith("https://acme.com") def test_absolute_external_link_preserved(self): result = scan_homepage_links( "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_ABSOLUTE_LEVER ) assert result == "https://jobs.lever.co/acme" def test_empty_html_returns_none(self): result = scan_homepage_links( "https://acme.com", FakeClient(), homepage_html="" ) assert result is None def test_fetch_failure_returns_none(self, monkeypatch): def boom(*a, **kw): raise RuntimeError("connection refused") monkeypatch.setattr("jobsource.careers.heuristics.request_with_retries", boom) result = scan_homepage_links("https://acme.com", FakeClient()) assert result is None def test_http_error_status_returns_none(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.heuristics.request_with_retries", lambda *a, **kw: FakeResponse(500, "error"), ) result = scan_homepage_links("https://acme.com", FakeClient()) assert result is None def test_skips_mailto_links(self): html = 'Email us' result = scan_homepage_links("https://acme.com", FakeClient(), homepage_html=html) assert result is None def test_skips_fragment_only_links(self): html = 'Careers' result = scan_homepage_links("https://acme.com", FakeClient(), homepage_html=html) assert result is None def test_prefers_href_careers_over_unrelated_text(self): # /careers in href scores; /about with neutral text should score lower. html = """ Company information Some random text """ result = scan_homepage_links("https://acme.com", FakeClient(), homepage_html=html) assert result is not None assert "careers" in result # --------------------------------------------------------------------------- # parse_sitemap # --------------------------------------------------------------------------- SIMPLE_SITEMAP = """ https://acme.com/about https://acme.com/careers https://acme.com/blog """ SITEMAP_NO_CAREERS = """ https://acme.com/about https://acme.com/blog """ SITEMAP_INDEX = """ https://acme.com/sitemap-pages.xml https://acme.com/sitemap-jobs.xml """ SITEMAP_JOBS_CHILD = """ https://acme.com/jobs/senior-engineer """ SITEMAP_PAGES_CHILD = """ https://acme.com/about """ class TestParseSitemap: def test_finds_careers_url(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.heuristics.request_with_retries", lambda *a, **kw: FakeResponse(200, SIMPLE_SITEMAP, "https://acme.com/sitemap.xml"), ) result = parse_sitemap("https://acme.com", FakeClient()) assert result == "https://acme.com/careers" def test_no_careers_url_returns_none(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.heuristics.request_with_retries", lambda *a, **kw: FakeResponse(200, SITEMAP_NO_CAREERS), ) result = parse_sitemap("https://acme.com", FakeClient()) assert result is None def test_404_returns_none(self, monkeypatch): monkeypatch.setattr( "jobsource.careers.heuristics.request_with_retries", lambda *a, **kw: FakeResponse(404, ""), ) result = parse_sitemap("https://acme.com", FakeClient()) assert result is None def test_network_error_returns_none(self, monkeypatch): def boom(*a, **kw): raise RuntimeError("timeout") monkeypatch.setattr("jobsource.careers.heuristics.request_with_retries", boom) result = parse_sitemap("https://acme.com", FakeClient()) assert result is None def test_sitemap_index_fetches_children(self, monkeypatch): responses = { "https://acme.com/sitemap.xml": FakeResponse(200, SITEMAP_INDEX), "https://acme.com/sitemap-pages.xml": FakeResponse(200, SITEMAP_PAGES_CHILD), "https://acme.com/sitemap-jobs.xml": FakeResponse(200, SITEMAP_JOBS_CHILD), } def fake_req(client, method, url, **kw): # Strip query params for lookup base_url = url.split("?")[0] return responses.get(base_url, FakeResponse(404, "")) monkeypatch.setattr("jobsource.careers.heuristics.request_with_retries", fake_req) result = parse_sitemap("https://acme.com", FakeClient()) assert result == "https://acme.com/jobs/senior-engineer"