# File-viewer header captured with the source: 426 lines, 15 KiB, Python.
"""Tests for jobsource/careers/heuristics.py — all network-free."""
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from jobsource.careers.heuristics import (
|
|
_base_parts,
|
|
_is_plausible_careers_url,
|
|
_score_anchor,
|
|
parse_sitemap,
|
|
probe_url_patterns,
|
|
scan_homepage_links,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tiny fake response helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class FakeResponse:
    """Minimal stand-in for an HTTP response object used by the tests below.

    Only the attributes set here (``status_code``, ``text``, ``url``) and a
    ``json()`` method are provided.
    """

    def __init__(self, status_code: int, text: str = "", url: str = "https://example.com"):
        self.status_code = status_code
        self.text = text
        self.url = url

    def json(self) -> object:
        """Return an empty dict regardless of ``text``."""
        return {}
|
|
|
|
|
|
class FakeClient:
    """Empty placeholder passed wherever the functions under test take a client.

    It has no attributes or methods; the tests below either patch the helper
    that would use the client or supply ``homepage_html`` directly.
    """
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _is_plausible_careers_url
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestIsPlausibleCareersUrl:
    """Accept/reject cases for ``_is_plausible_careers_url``.

    Each call passes two URLs. Going by the ``→`` comments in the tests, the
    first is the URL that was requested and the second is the URL it ended up
    at (presumably after redirects — confirm against the implementation).
    """

    def test_same_domain_clean_path(self):
        assert _is_plausible_careers_url("https://netflix.com/careers", "https://netflix.com/careers") is True

    def test_same_domain_deeper_path(self):
        # Google /careers → /about/careers/applications/
        assert _is_plausible_careers_url(
            "https://google.com/careers",
            "https://google.com/about/careers/applications/",
        ) is True

    def test_soft_404_notfound_rejected(self):
        # Netflix SPA: /careers → /NotFound?prev=...
        assert _is_plausible_careers_url(
            "https://www.netflix.com/careers",
            "https://www.netflix.com/NotFound?prev=https%3A%2F%2Fwww.netflix.com%2Fcareers",
        ) is False

    def test_soft_404_slash404_rejected(self):
        assert _is_plausible_careers_url(
            "https://example.com/jobs",
            "https://example.com/404",
        ) is False

    def test_soft_404_not_found_hyphen_rejected(self):
        assert _is_plausible_careers_url(
            "https://example.com/jobs",
            "https://example.com/not-found",
        ) is False

    def test_off_brand_cross_domain_rejected(self):
        # Microsoft /careers → bing.com
        assert _is_plausible_careers_url(
            "https://www.microsoft.com/careers",
            "https://www.bing.com?ref=aka&shorturl=abc",
        ) is False

    def test_on_brand_cross_domain_accepted(self):
        # Amazon /careers → amazon.jobs
        assert _is_plausible_careers_url(
            "https://www.amazon.com/careers",
            "https://amazon.jobs/en/",
        ) is True

    def test_workday_subdomain_accepted(self):
        # nvidia.com/careers → nvidia.wd5.myworkdayjobs.com/...
        assert _is_plausible_careers_url(
            "https://www.nvidia.com/careers",
            "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite",
        ) is True

    def test_short_brand_not_filtered(self):
        # brand <= 3 chars: skip the off-brand check
        assert _is_plausible_careers_url(
            "https://ibm.com/careers",
            "https://jobs.example.com/",
        ) is True  # "ibm" len=3 → no cross-domain filter applied
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _base_parts
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestBaseParts:
    """Checks on the 3-tuple returned by ``_base_parts``.

    The tests unpack it as ``(scheme, host, root)``: ``host`` keeps the
    hostname as written, while ``root`` is the hostname with a leading
    ``www.`` removed and other subdomains left in place.
    """

    def test_strips_www(self):
        scheme, host, root = _base_parts("https://www.acme.com/about")
        assert scheme == "https"
        assert host == "www.acme.com"
        assert root == "acme.com"

    def test_no_www(self):
        _, _, root = _base_parts("https://acme.com")
        assert root == "acme.com"

    def test_subdomain_preserved_in_host(self):
        _, host, root = _base_parts("https://careers.acme.com/jobs")
        assert host == "careers.acme.com"
        assert root == "careers.acme.com"  # www. stripping only

    def test_http_scheme(self):
        scheme, _, _ = _base_parts("http://acme.com")
        assert scheme == "http"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _score_anchor
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestScoreAnchor:
    """Relative-score checks for ``_score_anchor``.

    Each call passes two strings; as the test names indicate, the first is a
    link ``href`` and the second is the link's visible text.
    """

    def test_careers_href_high_score(self):
        assert _score_anchor("/careers", "") > 2.0

    def test_jobs_href_score(self):
        assert _score_anchor("/jobs", "") > 0

    def test_text_careers_adds_score(self):
        score_with = _score_anchor("/x", "Careers")
        score_without = _score_anchor("/x", "About")
        assert score_with > score_without

    def test_unrelated_href_and_text_zero(self):
        assert _score_anchor("/about", "About us") == 0.0

    def test_combined_href_and_text(self):
        combined = _score_anchor("/careers", "Careers")
        href_only = _score_anchor("/careers", "")
        assert combined > href_only
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# probe_url_patterns
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestProbeUrlPatterns:
    """Tests for ``probe_url_patterns`` with the network probe patched out.

    Every test replaces ``jobsource.careers.heuristics.probe_url`` with a fake
    taking ``(client, url)`` and returning either a URL string or ``None``, so
    no real request is made.
    """

    def test_returns_careers_path_on_hit(self, monkeypatch):
        def fake_probe(client, url):
            if url.endswith("/careers"):
                return "https://acme.com/careers"
            return None

        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe)
        result = probe_url_patterns("https://acme.com", FakeClient())
        assert result == "https://acme.com/careers"

    def test_returns_none_when_all_miss(self, monkeypatch):
        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", lambda c, u: None)
        result = probe_url_patterns("https://acme.com", FakeClient())
        assert result is None

    def test_returns_first_hit_not_second(self, monkeypatch):
        hits = []

        def fake_probe(client, url):
            hits.append(url)
            if url.endswith("/career"):
                return "https://acme.com/career"
            return None

        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe)
        result = probe_url_patterns("https://acme.com", FakeClient())
        assert result == "https://acme.com/career"
        # /careers was probed first and missed; /career was the first hit
        assert any("/careers" in u for u in hits)

    def test_careers_subdomain_candidate_included(self, monkeypatch):
        probed: list[str] = []

        def fake_probe(client, url):
            probed.append(url)
            return None

        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe)
        probe_url_patterns("https://www.acme.com", FakeClient())
        assert any("careers.acme.com" in u for u in probed)
        assert any("jobs.acme.com" in u for u in probed)

    def test_jobs_subdomain_hit(self, monkeypatch):
        def fake_probe(client, url):
            if "jobs.acme.com" in url:
                return "https://jobs.acme.com"
            return None

        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe)
        result = probe_url_patterns("https://www.acme.com", FakeClient())
        assert result == "https://jobs.acme.com"

    def test_soft_404_rejected_falls_through_to_none(self, monkeypatch):
        # All candidates redirect to a NotFound page — should return None
        monkeypatch.setattr(
            "jobsource.careers.heuristics.probe_url",
            lambda c, u: u.replace("/careers", "/NotFound").replace("/career", "/NotFound")
            .replace("/jobs", "/NotFound").replace("/join-us", "/NotFound")
            .replace("/join", "/NotFound"),
        )
        result = probe_url_patterns("https://www.netflix.com", FakeClient())
        assert result is None

    def test_off_brand_redirect_rejected(self, monkeypatch):
        # /careers redirects to an off-brand domain → skip; later candidate hits
        def fake_probe(client, url):
            if url.endswith("/careers"):
                return "https://www.bing.com?ref=aka"
            if url.endswith("/jobs"):
                return "https://acme.com/jobs"
            return None

        monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe)
        result = probe_url_patterns("https://www.acme.com", FakeClient())
        assert result == "https://acme.com/jobs"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# scan_homepage_links
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Homepage HTML fixtures passed to scan_homepage_links via ``homepage_html``.

# Navigation bar with one careers link among unrelated links.
HOMEPAGE_WITH_CAREERS = """
<html><body>
  <nav>
    <a href="/about">About</a>
    <a href="/careers">Careers</a>
    <a href="/blog">Blog</a>
  </nav>
</body></html>
"""

# Navigation bar whose links have no careers/jobs wording in href or text.
HOMEPAGE_NO_CAREER_LINKS = """
<html><body>
  <nav>
    <a href="/about">About</a>
    <a href="/blog">Blog</a>
    <a href="/contact">Contact</a>
  </nav>
</body></html>
"""

# Single link whose href is relative (no leading slash, no scheme).
HOMEPAGE_RELATIVE_LINK = """
<html><body>
  <a href="jobs">Open Jobs</a>
</body></html>
"""

# Single absolute link pointing at an external jobs.lever.co board.
HOMEPAGE_ABSOLUTE_LEVER = """
<html><body>
  <a href="https://jobs.lever.co/acme">Work with us</a>
</body></html>
"""
|
|
|
|
|
|
class TestScanHomepageLinks:
    """Tests for ``scan_homepage_links``.

    Most tests supply the page through the ``homepage_html`` keyword. The two
    tests that omit it patch ``jobsource.careers.heuristics.request_with_retries``
    to simulate a failed or non-OK fetch instead.
    """

    def test_finds_careers_link(self):
        result = scan_homepage_links(
            "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_WITH_CAREERS
        )
        assert result is not None
        assert "careers" in result

    def test_no_career_links_returns_none(self):
        result = scan_homepage_links(
            "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_NO_CAREER_LINKS
        )
        assert result is None

    def test_relative_href_resolved(self):
        result = scan_homepage_links(
            "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_RELATIVE_LINK
        )
        # "jobs" href + "Open Jobs" text should score above threshold
        assert result is not None
        assert result.startswith("https://acme.com")

    def test_absolute_external_link_preserved(self):
        result = scan_homepage_links(
            "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_ABSOLUTE_LEVER
        )
        assert result == "https://jobs.lever.co/acme"

    def test_empty_html_returns_none(self):
        result = scan_homepage_links(
            "https://acme.com", FakeClient(), homepage_html=""
        )
        assert result is None

    def test_fetch_failure_returns_none(self, monkeypatch):
        def boom(*a, **kw):
            raise RuntimeError("connection refused")

        monkeypatch.setattr("jobsource.careers.heuristics.request_with_retries", boom)
        result = scan_homepage_links("https://acme.com", FakeClient())
        assert result is None

    def test_http_error_status_returns_none(self, monkeypatch):
        monkeypatch.setattr(
            "jobsource.careers.heuristics.request_with_retries",
            lambda *a, **kw: FakeResponse(500, "error"),
        )
        result = scan_homepage_links("https://acme.com", FakeClient())
        assert result is None

    def test_skips_mailto_links(self):
        html = '<html><body><a href="mailto:jobs@acme.com">Email us</a></body></html>'
        result = scan_homepage_links("https://acme.com", FakeClient(), homepage_html=html)
        assert result is None

    def test_skips_fragment_only_links(self):
        html = '<html><body><a href="#careers">Careers</a></body></html>'
        result = scan_homepage_links("https://acme.com", FakeClient(), homepage_html=html)
        assert result is None

    def test_prefers_href_careers_over_unrelated_text(self):
        # /careers in href scores; /about with neutral text should score lower.
        html = """
        <html><body>
          <a href="/about">Company information</a>
          <a href="/careers">Some random text</a>
        </body></html>
        """
        result = scan_homepage_links("https://acme.com", FakeClient(), homepage_html=html)
        assert result is not None
        assert "careers" in result
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# parse_sitemap
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Sitemap XML fixtures served by the fake request helpers in TestParseSitemap.
# Each literal starts directly with the XML declaration (no leading whitespace).

# Flat <urlset> containing one /careers URL among unrelated pages.
SIMPLE_SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://acme.com/about</loc></url>
  <url><loc>https://acme.com/careers</loc></url>
  <url><loc>https://acme.com/blog</loc></url>
</urlset>"""

# Flat <urlset> with no careers/jobs URL at all.
SITEMAP_NO_CAREERS = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://acme.com/about</loc></url>
  <url><loc>https://acme.com/blog</loc></url>
</urlset>"""

# <sitemapindex> pointing at the two child sitemaps defined below.
SITEMAP_INDEX = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://acme.com/sitemap-pages.xml</loc></sitemap>
  <sitemap><loc>https://acme.com/sitemap-jobs.xml</loc></sitemap>
</sitemapindex>"""

# Child sitemap (sitemap-jobs.xml) holding a single job posting URL.
SITEMAP_JOBS_CHILD = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://acme.com/jobs/senior-engineer</loc></url>
</urlset>"""

# Child sitemap (sitemap-pages.xml) holding only an unrelated page.
SITEMAP_PAGES_CHILD = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://acme.com/about</loc></url>
</urlset>"""
|
|
|
|
|
|
class TestParseSitemap:
    """Tests for ``parse_sitemap`` with HTTP fetching patched out.

    Every test replaces ``jobsource.careers.heuristics.request_with_retries``
    with a fake that returns a ``FakeResponse`` or raises, so no request is made.
    """

    def test_finds_careers_url(self, monkeypatch):
        monkeypatch.setattr(
            "jobsource.careers.heuristics.request_with_retries",
            lambda *a, **kw: FakeResponse(200, SIMPLE_SITEMAP, "https://acme.com/sitemap.xml"),
        )
        result = parse_sitemap("https://acme.com", FakeClient())
        assert result == "https://acme.com/careers"

    def test_no_careers_url_returns_none(self, monkeypatch):
        monkeypatch.setattr(
            "jobsource.careers.heuristics.request_with_retries",
            lambda *a, **kw: FakeResponse(200, SITEMAP_NO_CAREERS),
        )
        result = parse_sitemap("https://acme.com", FakeClient())
        assert result is None

    def test_404_returns_none(self, monkeypatch):
        monkeypatch.setattr(
            "jobsource.careers.heuristics.request_with_retries",
            lambda *a, **kw: FakeResponse(404, ""),
        )
        result = parse_sitemap("https://acme.com", FakeClient())
        assert result is None

    def test_network_error_returns_none(self, monkeypatch):
        def boom(*a, **kw):
            raise RuntimeError("timeout")

        monkeypatch.setattr("jobsource.careers.heuristics.request_with_retries", boom)
        result = parse_sitemap("https://acme.com", FakeClient())
        assert result is None

    def test_sitemap_index_fetches_children(self, monkeypatch):
        # Canned responses keyed by URL; anything else is answered with a 404.
        responses = {
            "https://acme.com/sitemap.xml": FakeResponse(200, SITEMAP_INDEX),
            "https://acme.com/sitemap-pages.xml": FakeResponse(200, SITEMAP_PAGES_CHILD),
            "https://acme.com/sitemap-jobs.xml": FakeResponse(200, SITEMAP_JOBS_CHILD),
        }

        # NOTE(review): assumes the helper is called positionally as
        # (client, method, url, ...) — confirm against heuristics.py.
        def fake_req(client, method, url, **kw):
            # Strip query params for lookup
            base_url = url.split("?")[0]
            return responses.get(base_url, FakeResponse(404, ""))

        monkeypatch.setattr("jobsource.careers.heuristics.request_with_retries", fake_req)
        result = parse_sitemap("https://acme.com", FakeClient())
        assert result == "https://acme.com/jobs/senior-engineer"
|