# JobSourceAgent/tests/test_heuristics.py

"""Tests for jobsource/careers/heuristics.py — all network-free."""
from __future__ import annotations
import pytest
from jobsource.careers.heuristics import (
_base_parts,
_is_plausible_careers_url,
_score_anchor,
parse_sitemap,
probe_url_patterns,
scan_homepage_links,
)
# ---------------------------------------------------------------------------
# Tiny fake response helper
# ---------------------------------------------------------------------------
class FakeResponse:
    """Minimal stand-in for an HTTP response object used by these tests.

    Only the attributes the tests set up are provided: ``status_code``,
    ``text`` and ``url``, plus a ``json()`` method.
    """

    def __init__(self, status_code: int, text: str = "", url: str = "https://example.com"):
        self.status_code = status_code
        self.text = text
        self.url = url

    def json(self) -> object:
        """Return an empty dict regardless of ``text``."""
        return {}
class FakeClient:
    """Attribute-less placeholder passed wherever a client argument is required."""
# ---------------------------------------------------------------------------
# _is_plausible_careers_url
# ---------------------------------------------------------------------------
class TestIsPlausibleCareersUrl:
    """Checks on ``_is_plausible_careers_url(probed_url, final_url)``.

    Each case passes the URL that was probed and the URL it ended up at, and
    asserts whether the pair is accepted (``True``) or rejected (``False``).
    """

    def test_same_domain_clean_path(self):
        """An unchanged URL is accepted."""
        assert _is_plausible_careers_url(
            "https://netflix.com/careers",
            "https://netflix.com/careers",
        ) is True

    def test_same_domain_deeper_path(self):
        """A same-domain redirect to a deeper careers path is accepted."""
        # Google /careers → /about/careers/applications/
        assert _is_plausible_careers_url(
            "https://google.com/careers",
            "https://google.com/about/careers/applications/",
        ) is True

    def test_soft_404_notfound_rejected(self):
        """A redirect to a ``/NotFound`` page is rejected."""
        # Netflix SPA: /careers → /NotFound?prev=...
        assert _is_plausible_careers_url(
            "https://www.netflix.com/careers",
            "https://www.netflix.com/NotFound?prev=https%3A%2F%2Fwww.netflix.com%2Fcareers",
        ) is False

    def test_soft_404_slash404_rejected(self):
        """A redirect to ``/404`` is rejected."""
        assert _is_plausible_careers_url(
            "https://example.com/jobs",
            "https://example.com/404",
        ) is False

    def test_soft_404_not_found_hyphen_rejected(self):
        """A redirect to ``/not-found`` is rejected."""
        assert _is_plausible_careers_url(
            "https://example.com/jobs",
            "https://example.com/not-found",
        ) is False

    def test_off_brand_cross_domain_rejected(self):
        """A redirect to an unrelated domain is rejected."""
        # Microsoft /careers → bing.com
        assert _is_plausible_careers_url(
            "https://www.microsoft.com/careers",
            "https://www.bing.com?ref=aka&shorturl=abc",
        ) is False

    def test_on_brand_cross_domain_accepted(self):
        """A cross-domain redirect whose host still carries the brand is accepted."""
        # Amazon /careers → amazon.jobs
        assert _is_plausible_careers_url(
            "https://www.amazon.com/careers",
            "https://amazon.jobs/en/",
        ) is True

    def test_workday_subdomain_accepted(self):
        """A redirect to a brand-named Workday subdomain is accepted."""
        # nvidia.com/careers → nvidia.wd5.myworkdayjobs.com/...
        assert _is_plausible_careers_url(
            "https://www.nvidia.com/careers",
            "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite",
        ) is True

    def test_short_brand_not_filtered(self):
        """A very short brand name is exempt from the off-brand check."""
        # brand <= 3 chars: skip the off-brand check
        assert _is_plausible_careers_url(
            "https://ibm.com/careers",
            "https://jobs.example.com/",
        ) is True  # "ibm" len=3 → no cross-domain filter applied
# ---------------------------------------------------------------------------
# _base_parts
# ---------------------------------------------------------------------------
class TestBaseParts:
    """Checks on ``_base_parts(url)``, which is unpacked as ``(scheme, host, root)``."""

    def test_strips_www(self):
        """``host`` keeps the ``www.`` prefix while ``root`` drops it."""
        scheme, host, root = _base_parts("https://www.acme.com/about")
        assert scheme == "https"
        assert host == "www.acme.com"
        assert root == "acme.com"

    def test_no_www(self):
        """A host without ``www.`` is returned unchanged as ``root``."""
        _, _, root = _base_parts("https://acme.com")
        assert root == "acme.com"

    def test_subdomain_preserved_in_host(self):
        """Non-``www`` subdomains survive in both ``host`` and ``root``."""
        _, host, root = _base_parts("https://careers.acme.com/jobs")
        assert host == "careers.acme.com"
        assert root == "careers.acme.com"  # www. stripping only

    def test_http_scheme(self):
        """The scheme is reported as given, not forced to https."""
        scheme, _, _ = _base_parts("http://acme.com")
        assert scheme == "http"
# ---------------------------------------------------------------------------
# _score_anchor
# ---------------------------------------------------------------------------
class TestScoreAnchor:
    """Checks on ``_score_anchor(href, text)``, which yields a numeric score."""

    def test_careers_href_high_score(self):
        """A ``/careers`` href alone scores above 2.0."""
        assert _score_anchor("/careers", "") > 2.0

    def test_jobs_href_score(self):
        """A ``/jobs`` href alone scores above zero."""
        assert _score_anchor("/jobs", "") > 0

    def test_text_careers_adds_score(self):
        """Anchor text "Careers" outscores neutral text for the same href."""
        score_with = _score_anchor("/x", "Careers")
        score_without = _score_anchor("/x", "About")
        assert score_with > score_without

    def test_unrelated_href_and_text_zero(self):
        """An unrelated href with unrelated text scores exactly zero."""
        assert _score_anchor("/about", "About us") == 0.0

    def test_combined_href_and_text(self):
        """Matching href plus matching text beats the href alone."""
        combined = _score_anchor("/careers", "Careers")
        href_only = _score_anchor("/careers", "")
        assert combined > href_only
# ---------------------------------------------------------------------------
# probe_url_patterns
# ---------------------------------------------------------------------------
class TestProbeUrlPatterns:
    """Checks on ``probe_url_patterns(base_url, client)``.

    Every test replaces ``jobsource.careers.heuristics.probe_url`` with a fake
    taking ``(client, url)``, so no network access happens.
    """

    def test_returns_careers_path_on_hit(self, monkeypatch):
        """A hit on the ``/careers`` candidate is returned."""
        def fake_probe(client, url):
            if url.endswith("/careers"):
                return "https://acme.com/careers"
            return None

        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe)
        result = probe_url_patterns("https://acme.com", FakeClient())
        assert result == "https://acme.com/careers"

    def test_returns_none_when_all_miss(self, monkeypatch):
        """``None`` is returned when every candidate probe misses."""
        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", lambda c, u: None)
        result = probe_url_patterns("https://acme.com", FakeClient())
        assert result is None

    def test_returns_first_hit_not_second(self, monkeypatch):
        """The ``/career`` hit is returned after an earlier candidate missed."""
        hits = []

        def fake_probe(client, url):
            hits.append(url)
            if url.endswith("/career"):
                return "https://acme.com/career"
            return None

        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe)
        result = probe_url_patterns("https://acme.com", FakeClient())
        assert result == "https://acme.com/career"
        # /careers was probed first and missed; /career was the first hit
        # NOTE(review): this is a substring match, so a probed host such as
        # "https://careers.acme.com" would also satisfy it (via "//careers").
        # Confirm whether u.endswith("/careers") is what is really intended.
        assert any("/careers" in u for u in hits)

    def test_careers_subdomain_candidate_included(self, monkeypatch):
        """``careers.`` and ``jobs.`` subdomains are among the probed URLs."""
        probed: list[str] = []

        def fake_probe(client, url):
            probed.append(url)
            return None

        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe)
        probe_url_patterns("https://www.acme.com", FakeClient())
        assert any("careers.acme.com" in u for u in probed)
        assert any("jobs.acme.com" in u for u in probed)

    def test_jobs_subdomain_hit(self, monkeypatch):
        """A hit on the ``jobs.`` subdomain candidate is returned."""
        def fake_probe(client, url):
            if "jobs.acme.com" in url:
                return "https://jobs.acme.com"
            return None

        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe)
        result = probe_url_patterns("https://www.acme.com", FakeClient())
        assert result == "https://jobs.acme.com"

    def test_soft_404_rejected_falls_through_to_none(self, monkeypatch):
        """``None`` is returned when every probe lands on a NotFound URL."""
        # All candidates redirect to a NotFound page — should return None
        def fake_probe(client, url):
            # Order matters: longer segments are rewritten before their
            # prefixes ("/careers" before "/career", "/join-us" before "/join").
            for segment in ("/careers", "/career", "/jobs", "/join-us", "/join"):
                url = url.replace(segment, "/NotFound")
            return url

        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe)
        result = probe_url_patterns("https://www.netflix.com", FakeClient())
        assert result is None

    def test_off_brand_redirect_rejected(self, monkeypatch):
        """An off-brand redirect is skipped in favour of a later on-brand hit."""
        # /careers redirects to an off-brand domain → skip; later candidate hits
        def fake_probe(client, url):
            if url.endswith("/careers"):
                return "https://www.bing.com?ref=aka"
            if url.endswith("/jobs"):
                return "https://acme.com/jobs"
            return None

        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe)
        result = probe_url_patterns("https://www.acme.com", FakeClient())
        assert result == "https://acme.com/jobs"
# ---------------------------------------------------------------------------
# scan_homepage_links
# ---------------------------------------------------------------------------
# Homepage whose nav holds a /careers anchor between /about and /blog.
HOMEPAGE_WITH_CAREERS = """
<html><body>
<nav>
<a href="/about">About</a>
<a href="/careers">Careers</a>
<a href="/blog">Blog</a>
</nav>
</body></html>
"""
# Homepage whose nav holds only /about, /blog and /contact anchors.
HOMEPAGE_NO_CAREER_LINKS = """
<html><body>
<nav>
<a href="/about">About</a>
<a href="/blog">Blog</a>
<a href="/contact">Contact</a>
</nav>
</body></html>
"""
# Homepage whose single anchor uses a relative href ("jobs", no leading slash).
HOMEPAGE_RELATIVE_LINK = """
<html><body>
<a href="jobs">Open Jobs</a>
</body></html>
"""
# Homepage whose single anchor is an absolute link to jobs.lever.co.
HOMEPAGE_ABSOLUTE_LEVER = """
<html><body>
<a href="https://jobs.lever.co/acme">Work with us</a>
</body></html>
"""
class TestScanHomepageLinks:
    """Checks on ``scan_homepage_links(base_url, client, homepage_html=...)``.

    Most cases pass the HTML directly via ``homepage_html``; the two fetch
    cases omit it and replace ``request_with_retries`` with a fake instead.
    """

    def test_finds_careers_link(self):
        """A homepage with a /careers anchor yields a URL containing "careers"."""
        result = scan_homepage_links(
            "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_WITH_CAREERS
        )
        assert result is not None
        assert "careers" in result

    def test_no_career_links_returns_none(self):
        """A homepage without career-like anchors yields ``None``."""
        result = scan_homepage_links(
            "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_NO_CAREER_LINKS
        )
        assert result is None

    def test_relative_href_resolved(self):
        """A relative href comes back resolved against the base URL."""
        result = scan_homepage_links(
            "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_RELATIVE_LINK
        )
        # "jobs" href + "Open Jobs" text should score above threshold
        assert result is not None
        assert result.startswith("https://acme.com")

    def test_absolute_external_link_preserved(self):
        """An absolute external href is returned unchanged."""
        result = scan_homepage_links(
            "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_ABSOLUTE_LEVER
        )
        assert result == "https://jobs.lever.co/acme"

    def test_empty_html_returns_none(self):
        """Empty ``homepage_html`` yields ``None``."""
        result = scan_homepage_links(
            "https://acme.com", FakeClient(), homepage_html=""
        )
        assert result is None

    def test_fetch_failure_returns_none(self, monkeypatch):
        """An exception raised by the fetch helper yields ``None``."""
        def boom(*a, **kw):
            raise RuntimeError("connection refused")

        monkeypatch.setattr("jobsource.careers.heuristics.request_with_retries", boom)
        result = scan_homepage_links("https://acme.com", FakeClient())
        assert result is None

    def test_http_error_status_returns_none(self, monkeypatch):
        """A 500 response from the fetch helper yields ``None``."""
        monkeypatch.setattr(
            "jobsource.careers.heuristics.request_with_retries",
            lambda *a, **kw: FakeResponse(500, "error"),
        )
        result = scan_homepage_links("https://acme.com", FakeClient())
        assert result is None

    def test_skips_mailto_links(self):
        """A ``mailto:`` anchor is never returned, even with "jobs" in it."""
        html = '<html><body><a href="mailto:jobs@acme.com">Email us</a></body></html>'
        result = scan_homepage_links("https://acme.com", FakeClient(), homepage_html=html)
        assert result is None

    def test_skips_fragment_only_links(self):
        """A fragment-only anchor (``#careers``) is never returned."""
        html = '<html><body><a href="#careers">Careers</a></body></html>'
        result = scan_homepage_links("https://acme.com", FakeClient(), homepage_html=html)
        assert result is None

    def test_prefers_href_careers_over_unrelated_text(self):
        """A /careers href wins even when its anchor text is unrelated."""
        # /careers in href scores; /about with neutral text should score lower.
        # Built from adjacent literals so the markup carries no added indentation.
        html = (
            "\n"
            "<html><body>\n"
            '<a href="/about">Company information</a>\n'
            '<a href="/careers">Some random text</a>\n'
            "</body></html>\n"
        )
        result = scan_homepage_links("https://acme.com", FakeClient(), homepage_html=html)
        assert result is not None
        assert "careers" in result
# ---------------------------------------------------------------------------
# parse_sitemap
# ---------------------------------------------------------------------------
# Flat urlset listing /about, /careers and /blog.
SIMPLE_SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://acme.com/about</loc></url>
<url><loc>https://acme.com/careers</loc></url>
<url><loc>https://acme.com/blog</loc></url>
</urlset>"""
# Flat urlset listing only /about and /blog.
SITEMAP_NO_CAREERS = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://acme.com/about</loc></url>
<url><loc>https://acme.com/blog</loc></url>
</urlset>"""
# Sitemap index pointing at two child sitemaps: sitemap-pages.xml and sitemap-jobs.xml.
SITEMAP_INDEX = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>https://acme.com/sitemap-pages.xml</loc></sitemap>
<sitemap><loc>https://acme.com/sitemap-jobs.xml</loc></sitemap>
</sitemapindex>"""
# Child urlset with a single /jobs/senior-engineer entry.
SITEMAP_JOBS_CHILD = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://acme.com/jobs/senior-engineer</loc></url>
</urlset>"""
# Child urlset with a single /about entry.
SITEMAP_PAGES_CHILD = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://acme.com/about</loc></url>
</urlset>"""
class TestParseSitemap:
    """Checks on ``parse_sitemap(base_url, client)``.

    ``jobsource.careers.heuristics.request_with_retries`` is replaced in every
    test, so responses come from the XML fixtures rather than the network.
    """

    def test_finds_careers_url(self, monkeypatch):
        """The /careers entry of a flat sitemap is returned."""
        monkeypatch.setattr(
            "jobsource.careers.heuristics.request_with_retries",
            lambda *a, **kw: FakeResponse(200, SIMPLE_SITEMAP, "https://acme.com/sitemap.xml"),
        )
        result = parse_sitemap("https://acme.com", FakeClient())
        assert result == "https://acme.com/careers"

    def test_no_careers_url_returns_none(self, monkeypatch):
        """A sitemap without career-like URLs yields ``None``."""
        monkeypatch.setattr(
            "jobsource.careers.heuristics.request_with_retries",
            lambda *a, **kw: FakeResponse(200, SITEMAP_NO_CAREERS),
        )
        result = parse_sitemap("https://acme.com", FakeClient())
        assert result is None

    def test_404_returns_none(self, monkeypatch):
        """A 404 response for the sitemap yields ``None``."""
        monkeypatch.setattr(
            "jobsource.careers.heuristics.request_with_retries",
            lambda *a, **kw: FakeResponse(404, ""),
        )
        result = parse_sitemap("https://acme.com", FakeClient())
        assert result is None

    def test_network_error_returns_none(self, monkeypatch):
        """An exception raised by the fetch helper yields ``None``."""
        def boom(*a, **kw):
            raise RuntimeError("timeout")

        monkeypatch.setattr("jobsource.careers.heuristics.request_with_retries", boom)
        result = parse_sitemap("https://acme.com", FakeClient())
        assert result is None

    def test_sitemap_index_fetches_children(self, monkeypatch):
        """A sitemap index is followed into its children to find a jobs URL."""
        responses = {
            "https://acme.com/sitemap.xml": FakeResponse(200, SITEMAP_INDEX),
            "https://acme.com/sitemap-pages.xml": FakeResponse(200, SITEMAP_PAGES_CHILD),
            "https://acme.com/sitemap-jobs.xml": FakeResponse(200, SITEMAP_JOBS_CHILD),
        }

        def fake_req(client, method, url, **kw):
            # Strip query params for lookup
            base_url = url.split("?")[0]
            return responses.get(base_url, FakeResponse(404, ""))

        monkeypatch.setattr("jobsource.careers.heuristics.request_with_retries", fake_req)
        result = parse_sitemap("https://acme.com", FakeClient())
        assert result == "https://acme.com/jobs/senior-engineer"