Files
JobSourceAgent/jobsource/careers/heuristics.py

335 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Deterministic careers-page heuristics: URL probing, homepage scan, sitemap (Stage 2, tiers 2-4).
Tier 2 — URL patterns: probe /careers, /career, /jobs, /join-us, /join and
subdomains careers.{domain}, jobs.{domain} via HTTP HEAD/GET.
Tier 3 — Homepage link scan: parse <a> anchors from the homepage HTML, rank
by career/job keywords in href and text, return highest-scored link.
Tier 4 — Sitemap: fetch /sitemap.xml (and sitemap index children), scan <loc>
elements for career/job keywords, return the first match.
"""
from __future__ import annotations
import logging
import re
from urllib.parse import urljoin, urlsplit
import httpx
from bs4 import BeautifulSoup
from ..http import probe_url, request_with_retries
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Path fragments that hint at a careers page.  Every fragment is compiled
# case-insensitively and kept in the order listed here.
_CAREER_PATH_PATTERNS: list[re.Pattern[str]] = [
    re.compile(fragment, flags=re.I)
    for fragment in (
        r"/career",  # /careers, /career
        r"/job",  # /jobs, /job-listings
        r"/join",  # /join-us, /join-our-team
        r"/work-with",  # /work-with-us
        r"/we-re-hiring",
        r"/openings?",
        r"/opportunities",
        r"/positions?",
        r"/vacancies",
        r"/hiring",
    )
]
# Scoring table for an anchor's href.  Each entry pairs a case-insensitive
# compiled pattern with the weight it is worth.
# Tuples: (compiled pattern, score)
_HREF_WEIGHTS: list[tuple[re.Pattern[str], float]] = [
    (re.compile(fragment, re.IGNORECASE), weight)
    for fragment, weight in (
        (r"/career", 3.0),
        (r"/job", 2.5),
        (r"/join", 2.0),
        (r"/opening", 2.0),
        (r"/position", 2.0),
        (r"/opportunit", 1.5),
        (r"/work.with", 1.5),
        (r"/hiring", 1.5),
        (r"/vacanc", 1.5),
    )
]
# Scoring table for an anchor's visible text.  Each entry pairs a
# case-insensitive, word-bounded pattern with the weight it is worth.
_TEXT_WEIGHTS: list[tuple[re.Pattern[str], float]] = [
    (re.compile(phrase, re.IGNORECASE), weight)
    for phrase, weight in (
        (r"\bcareers?\b", 3.0),
        (r"\bjobs?\b", 2.5),
        (r"\bjoin\s+us\b", 2.0),
        (r"\bopen\s+positions?\b", 2.0),
        (r"\bwork\s+with\s+us\b", 1.5),
        (r"\bwe.?re\s+hiring\b", 1.5),
        (r"\bopportunities\b", 1.5),
        (r"\bhiring\b", 1.0),
    )
]
# A homepage link must score at least this much to be accepted as a careers
# page candidate.
_SCORE_THRESHOLD: float = 2.0
# Upper bound on child sitemaps fetched when processing a sitemap index.
_MAX_SITEMAP_CHILDREN: int = 5
# Upper bound on <loc> entries to scan across all sitemaps.
_MAX_LOC_ENTRIES: int = 500
# Soft-404 path indicators used to reject SPA-style error pages.
_SOFT_404_INDICATORS = ("notfound", "/404", "not-found", "/pagenot", "/error-page")
# Second-level labels that, followed by a two-letter country code, form a
# compound public suffix (e.g. "co.uk", "com.au").  Heuristic, not exhaustive.
_COMPOUND_SUFFIX_LABELS = ("ac", "co", "com", "edu", "go", "gov", "ne", "net", "or", "org")
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _brand_label(host: str) -> str:
    """Return the label of *host* most likely to be the brand name.

    This is the label immediately left of the public suffix, so
    'careers.acme.com' and 'acme.co.uk' both give 'acme'.  Compound suffixes
    are recognised heuristically via _COMPOUND_SUFFIX_LABELS.  A host with
    fewer than two labels is returned unchanged.
    """
    labels = [label for label in host.split(".") if label]
    if len(labels) < 2:
        return host
    if (
        len(labels) >= 3
        and len(labels[-1]) == 2
        and labels[-2] in _COMPOUND_SUFFIX_LABELS
    ):
        return labels[-3]
    return labels[-2]


def _is_plausible_careers_url(original_url: str, final_url: str) -> bool:
    """Return False for obvious false positives from redirect chains.

    Rejects:
    - SPA soft-404 paths: the final URL path contains one of
      _SOFT_404_INDICATORS (e.g. Netflix /careers → /NotFound?prev=...).
    - Off-brand cross-domain redirects: the final host does not contain the
      original host's brand label (e.g. microsoft.com/careers → bing.com).

    Accepts legitimate cross-domain redirects where the brand is preserved
    (e.g. amazon.com → amazon.jobs), including redirects from a probed
    subdomain back to the main site (careers.acme.com → acme.com/careers).

    Args:
        original_url: The URL that was requested.
        final_url: The URL reached after following redirects.

    Returns:
        True when the final URL is still a believable careers location.
    """
    orig = urlsplit(original_url)
    final = urlsplit(final_url)

    # 1. Reject soft-404 path indicators (case-insensitive).
    final_path_lower = final.path.lower()
    if any(indicator in final_path_lower for indicator in _SOFT_404_INDICATORS):
        return False

    # 2. If the host changed, verify the brand name survives in the new host.
    # `hostname` (unlike `netloc`) is lower-cased and excludes port/userinfo,
    # so a port-only difference is not treated as a domain change.
    orig_host = re.sub(r"^www\.", "", orig.hostname or "")
    final_host = re.sub(r"^www\.", "", final.hostname or "")
    if orig_host != final_host:
        # Use the registrable-domain label, not the first label: for
        # "careers.microsoft.com" the brand is "microsoft", not "careers".
        brand = _brand_label(orig_host)
        # Very short brands give too many accidental substring hits/misses,
        # so the check only applies to brands longer than three characters.
        if len(brand) > 3 and brand not in final_host:
            return False
    return True
def _base_parts(website: str) -> tuple[str, str, str]:
    """Split *website* into ``(scheme, host, root_domain)``.

    The scheme defaults to 'https' when the input has none.  The host is the
    URL's network location, or the first path segment when the input has no
    network location (e.g. 'acme.com/about').  root_domain is the host with
    one leading 'www.' removed (case-insensitively), so subdomain candidates
    such as 'careers.{root_domain}' are formed correctly.

    Example: 'https://www.acme.com/about' → ('https', 'www.acme.com', 'acme.com')
    """
    split = urlsplit(website)

    if split.netloc:
        host = split.netloc
    else:
        # No network location was parsed; use everything before the first "/".
        host = split.path.partition("/")[0]

    scheme = split.scheme if split.scheme else "https"

    # Drop a single leading "www." regardless of letter case.
    root_domain = host[4:] if host[:4].lower() == "www." else host
    return scheme, host, root_domain
# ---------------------------------------------------------------------------
# Tier 2 — URL-pattern probing
# ---------------------------------------------------------------------------
def probe_url_patterns(website: str, client: httpx.Client) -> str | None:
    """Probe well-known career URL paths and subdomains; return first plausible hit.

    Probes in order:
        /careers, /career, /jobs, /join-us, /join
        careers.{root_domain}, jobs.{root_domain}

    Each candidate is handed to ``probe_url``.  A truthy result is taken to be
    the reached URL and is accepted only if ``_is_plausible_careers_url``
    approves the candidate → result pair.
    NOTE(review): the original docstring states probe_url uses HTTP HEAD with
    a GET fallback; that helper is defined elsewhere — confirm there.

    Args:
        website: Company website URL used to derive scheme, host and root domain.
        client: HTTP client passed through to ``probe_url``.

    Returns:
        The first accepted URL, or None when no candidate qualifies.
    """
    scheme, host, root = _base_parts(website)
    base = f"{scheme}://{host}"
    candidates: list[str] = [
        *(f"{base}{path}" for path in ("/careers", "/career", "/jobs", "/join-us", "/join")),
        *(f"{scheme}://{subdomain}.{root}" for subdomain in ("careers", "jobs")),
    ]

    for candidate in candidates:
        result = probe_url(client, candidate)
        if not result:
            continue
        if _is_plausible_careers_url(candidate, result):
            logger.info("heuristics.probe_url_patterns(%s): hit url=%s", website, result)
            return result
        # Reachable, but it landed somewhere implausible (soft-404 / off-brand).
        # The arrow separator keeps the two URLs distinguishable in the log.
        logger.info(
            "heuristics.probe_url_patterns(%s): rejected redirect %s -> %s",
            website, candidate, result,
        )

    logger.info("heuristics.probe_url_patterns(%s): no pattern matched", website)
    return None
# ---------------------------------------------------------------------------
# Tier 3 — Homepage link scan
# ---------------------------------------------------------------------------
def scan_homepage_links(
    website: str,
    client: httpx.Client,
    *,
    homepage_html: str | None = None,
) -> str | None:
    """Pick the homepage anchor that looks most like a careers link.

    When *homepage_html* is supplied it is parsed as-is; otherwise the
    homepage is fetched with a GET request.  Every usable ``<a href>`` is
    resolved against *website* and scored with ``_score_anchor``.  The
    earliest anchor holding the highest score is returned if that score
    reaches ``_SCORE_THRESHOLD``; otherwise None.

    Returns None as well when the fetch fails, the response status is >= 400,
    or the HTML is empty.
    """
    markup = homepage_html
    if markup is None:
        try:
            response = request_with_retries(client, "GET", website, max_retries=1)
            if response.status_code >= 400:
                logger.warning(
                    "heuristics.scan_homepage_links(%s): HTTP %s", website, response.status_code
                )
                return None
            markup = response.text
        except Exception as exc:
            logger.warning(
                "heuristics.scan_homepage_links(%s): fetch error: %s", website, exc
            )
            return None
    if not markup:
        return None

    ignored_prefixes = ("mailto:", "tel:", "#", "javascript:")
    scored: list[tuple[float, str]] = []
    for anchor in BeautifulSoup(markup, "lxml").find_all("a", href=True):
        raw_href: str = anchor["href"].strip()
        # Empty hrefs, fragments and non-navigational schemes are not pages.
        if raw_href == "" or raw_href.startswith(ignored_prefixes):
            continue
        absolute = urljoin(website, raw_href)
        # Anything that does not resolve to http(s) is discarded.
        if not absolute.startswith(("http://", "https://")):
            continue
        points = _score_anchor(raw_href, anchor.get_text(separator=" ", strip=True))
        # Zero-scoring anchors can never become the best candidate.
        if points > 0.0:
            scored.append((points, absolute))

    # max() keeps the earliest entry among equal scores.
    top_score, top_url = max(scored, key=lambda pair: pair[0], default=(0.0, None))
    if top_url and top_score >= _SCORE_THRESHOLD:
        logger.info(
            "heuristics.scan_homepage_links(%s): hit url=%s score=%.1f",
            website, top_url, top_score,
        )
        return top_url

    logger.info(
        "heuristics.scan_homepage_links(%s): no link above threshold (best=%.1f)",
        website, top_score,
    )
    return None
def _score_anchor(href: str, text: str) -> float:
    """Return the summed weight of every keyword pattern the anchor matches.

    *href* is checked against ``_HREF_WEIGHTS`` and *text* against
    ``_TEXT_WEIGHTS``.  Each matching pattern adds its weight once, and 0.0 is
    returned when nothing matches.
    """
    subjects = ((href, _HREF_WEIGHTS), (text, _TEXT_WEIGHTS))
    return sum(
        (
            weight
            for subject, table in subjects
            for pattern, weight in table
            if pattern.search(subject)
        ),
        0.0,
    )
# ---------------------------------------------------------------------------
# Tier 4 — Sitemap
# ---------------------------------------------------------------------------
# Matches a URL containing a path segment that begins with a career/job
# keyword stem (case-insensitive).
_CAREER_URL_RE: re.Pattern[str] = re.compile(
    r"/(career|job|join|opening|position|opportunit|vacanc|hiring)",
    flags=re.I,
)
def parse_sitemap(website: str, client: httpx.Client) -> str | None:
    """Look for a careers URL in the site's /sitemap.xml.

    The sitemap URL is built from the scheme and host of *website*.  The
    fetched XML is handed to ``_scan_sitemap_xml``, which performs the scan.
    Returns the URL it finds, or None when the sitemap cannot be fetched, is
    missing, or contains no match.
    """
    scheme, host, _root = _base_parts(website)
    sitemap_url = f"{scheme}://{host}/sitemap.xml"

    try:
        xml = _fetch_xml(client, sitemap_url)
    except Exception as exc:
        logger.info("heuristics.parse_sitemap(%s): fetch error: %s", website, exc)
        return None

    if xml is None:
        logger.info("heuristics.parse_sitemap(%s): sitemap not found", website)
        return None

    match = _scan_sitemap_xml(xml, website, client)
    if not match:
        logger.info("heuristics.parse_sitemap(%s): no career URL found", website)
    else:
        logger.info("heuristics.parse_sitemap(%s): hit url=%s", website, match)
    return match
def _fetch_xml(client: httpx.Client, url: str) -> str | None:
    """GET *url*; return the body text for a status below 400, otherwise None."""
    response = request_with_retries(client, "GET", url, max_retries=1)
    return response.text if response.status_code < 400 else None
def _scan_sitemap_xml(xml: str, website: str, client: httpx.Client) -> str | None:
    """Parse sitemap XML and return the first career-looking URL, if any.

    A plain sitemap (``<url><loc>…</loc></url>``) is scanned directly.  For a
    sitemap index (``<sitemap><loc>…</loc></sitemap>``), up to
    ``_MAX_SITEMAP_CHILDREN`` child sitemaps are fetched and scanned in
    document order.  At most ``_MAX_LOC_ENTRIES`` <loc> entries are examined
    in total across all children.

    Args:
        xml: Text of the root sitemap document.
        website: Site being processed; used only for log context.
        client: HTTP client used to fetch child sitemaps.

    Returns:
        The first <loc> URL matching ``_CAREER_URL_RE``, or None.
    """
    soup = BeautifulSoup(xml, "xml")

    sitemap_tags = soup.find_all("sitemap")
    if not sitemap_tags:
        # Plain sitemap: reuse the already-parsed document.
        match, _ = _scan_loc_tags(soup, _MAX_LOC_ENTRIES)
        return match

    # Sitemap index: collect child sitemap URLs, looking up <loc> once per tag.
    child_urls: list[str] = []
    for tag in sitemap_tags:
        loc_tag = tag.find("loc")
        if loc_tag is None:
            continue
        child_url = loc_tag.get_text(strip=True)
        if child_url:
            child_urls.append(child_url)

    remaining = _MAX_LOC_ENTRIES  # scan budget shared by all children
    for child_url in child_urls[:_MAX_SITEMAP_CHILDREN]:
        if remaining <= 0:
            break
        try:
            child_xml = _fetch_xml(client, child_url)
        except Exception as exc:
            # Best effort: one unreachable child must not abort the scan,
            # but leave a trace for debugging.
            logger.debug(
                "heuristics._scan_sitemap_xml(%s): child sitemap %s fetch error: %s",
                website, child_url, exc,
            )
            continue
        if not child_xml:
            continue
        match, scanned = _scan_loc_tags(BeautifulSoup(child_xml, "xml"), remaining)
        if match:
            return match
        remaining -= scanned
    return None


def _scan_loc_tags(soup: BeautifulSoup, limit: int) -> tuple[str | None, int]:
    """Scan up to *limit* <loc> elements; return (first match or None, number scanned)."""
    scanned = 0
    for loc_tag in soup.find_all("loc"):
        if scanned >= limit:
            break
        scanned += 1
        loc: str = loc_tag.get_text(strip=True)
        if _CAREER_URL_RE.search(loc):
            return loc, scanned
    return None, scanned


def _scan_locs(xml: str) -> str | None:
    """Scan <loc> elements in a sitemap for career/job keywords.

    Examines at most ``_MAX_LOC_ENTRIES`` entries and returns the first URL
    matching ``_CAREER_URL_RE``, or None.
    """
    match, _ = _scan_loc_tags(BeautifulSoup(xml, "xml"), _MAX_LOC_ENTRIES)
    return match