"""Deterministic careers-page heuristics: URL probing, homepage scan, sitemap (Stage 2, tiers 2–4).

Tier 2 — URL patterns: probe /careers, /career, /jobs, /join-us, /join and
  subdomains careers.{domain}, jobs.{domain} via HTTP HEAD/GET.

Tier 3 — Homepage link scan: parse anchors from the homepage HTML, rank by
  career/job keywords in href and text, return highest-scored link.

Tier 4 — Sitemap: fetch /sitemap.xml (and sitemap index children), scan <loc>
  elements for career/job keywords, return the first match.
"""

from __future__ import annotations

import logging
import re
from urllib.parse import urljoin, urlsplit

import httpx
from bs4 import BeautifulSoup

from ..http import probe_url, request_with_retries

logger = logging.getLogger(__name__)

# URL path segments and keywords that signal a careers page.
# Each entry is compiled case-insensitively; the trailing comments list
# example paths the pattern is meant to catch.
_CAREER_PATH_PATTERNS: list[re.Pattern[str]] = [
    re.compile(p, re.IGNORECASE)
    for p in [
        r"/career",  # /careers, /career
        r"/job",  # /jobs, /job-listings
        r"/join",  # /join-us, /join-our-team
        r"/work-with",  # /work-with-us
        r"/we-re-hiring",
        r"/openings?",
        r"/opportunities",
        r"/positions?",
        r"/vacancies",
        r"/hiring",
    ]
]

# Weighted keyword scoring for anchor text and href paths.
# Tuples: (compiled pattern, score)
_HREF_WEIGHTS: list[tuple[re.Pattern[str], float]] = [
    (re.compile(r"/career", re.I), 3.0),
    (re.compile(r"/job", re.I), 2.5),
    (re.compile(r"/join", re.I), 2.0),
    (re.compile(r"/opening", re.I), 2.0),
    (re.compile(r"/position", re.I), 2.0),
    (re.compile(r"/opportunit", re.I), 1.5),
    (re.compile(r"/work.with", re.I), 1.5),
    (re.compile(r"/hiring", re.I), 1.5),
    (re.compile(r"/vacanc", re.I), 1.5),
]

_TEXT_WEIGHTS: list[tuple[re.Pattern[str], float]] = [
    (re.compile(r"\bcareers?\b", re.I), 3.0),
    (re.compile(r"\bjobs?\b", re.I), 2.5),
    (re.compile(r"\bjoin\s+us\b", re.I), 2.0),
    (re.compile(r"\bopen\s+positions?\b", re.I), 2.0),
    (re.compile(r"\bwork\s+with\s+us\b", re.I), 1.5),
    (re.compile(r"\bwe.?re\s+hiring\b", re.I), 1.5),
    (re.compile(r"\bopportunities\b", re.I), 1.5),
    (re.compile(r"\bhiring\b", re.I), 1.0),
]

# Minimum score to accept a homepage link as a careers page candidate.
_SCORE_THRESHOLD = 2.0

# Maximum child sitemaps to fetch when processing a sitemap index.
_MAX_SITEMAP_CHILDREN = 5

# Maximum <loc> entries to scan across all sitemaps.
_MAX_LOC_ENTRIES = 500

# Soft-404 path indicators used to reject SPA-style error pages.
_SOFT_404_INDICATORS = ("notfound", "/404", "not-found", "/pagenot", "/error-page")

# Host labels that name a site section rather than a brand.  They are skipped
# when deriving the brand from a host such as "careers.acme.com".
_GENERIC_SUBDOMAINS = frozenset({"www", "careers", "career", "jobs", "job"})

# href prefixes that can never point at a careers page.
_SKIPPED_HREF_PREFIXES = ("mailto:", "tel:", "#", "javascript:")

# Matches a leading URL scheme ("https:", "ftp:", ...).
_SCHEME_RE = re.compile(r"^[a-z][a-z0-9+.\-]*:", re.IGNORECASE)


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _normalized_host(parts) -> str:
    """Return the lower-cased hostname of a split URL without a leading 'www.'.

    Uses ``hostname`` rather than ``netloc`` so that ports and userinfo do not
    take part in host comparisons.  Returns "" when the URL has no host.
    """
    host = parts.hostname or ""
    return re.sub(r"^www\.", "", host)


def _brand_label(host: str) -> str:
    """Return the label of *host* most likely to carry the brand name.

    Leading generic labels (see ``_GENERIC_SUBDOMAINS``) are skipped while at
    least two labels remain, so "careers.acme.com" yields "acme" while a bare
    "jobs.com" still yields "jobs".
    """
    labels = host.split(".")
    while len(labels) > 2 and labels[0] in _GENERIC_SUBDOMAINS:
        labels = labels[1:]
    return labels[0]


def _is_plausible_careers_url(original_url: str, final_url: str) -> bool:
    """Return False for obvious false positives from redirect chains.

    Rejects:
    - SPA soft-404 paths: the final URL path contains one of
      ``_SOFT_404_INDICATORS`` (e.g. Netflix /careers → /NotFound?prev=...)
    - Off-brand cross-domain redirects: the final host does not contain the
      original host's brand label (e.g. microsoft.com/careers → bing.com)

    Accepts legitimate cross-domain redirects where the brand is preserved
    (e.g. amazon.com → amazon.jobs), including redirects from a probed
    ``careers.``/``jobs.`` subdomain back to the main site
    (e.g. careers.acme.com → www.acme.com/careers).

    Args:
        original_url: The URL that was requested.
        final_url: The URL reached after following redirects.

    Returns:
        True when the final URL is still a believable careers-page location.
    """
    orig = urlsplit(original_url)
    final = urlsplit(final_url)

    # 1. Reject soft-404 path indicators (case-insensitive).
    final_path_lower = final.path.lower()
    if any(indicator in final_path_lower for indicator in _SOFT_404_INDICATORS):
        return False

    # 2. If the host changed, verify the brand name survives in the new host.
    orig_host = _normalized_host(orig)
    final_host = _normalized_host(final)
    if orig_host != final_host:
        brand = _brand_label(orig_host)  # e.g. "microsoft" from "microsoft.com"
        # Very short labels are too ambiguous for a substring test, so only
        # brands longer than three characters can trigger a rejection.
        if len(brand) > 3 and brand not in final_host:
            return False

    return True


def _base_parts(website: str) -> tuple[str, str, str]:
    """Return (scheme, host, root_domain) for a website URL.

    The scheme defaults to "https" when *website* has none; in that case the
    host is taken from the first path segment.  root_domain strips a leading
    'www.' from the host so that subdomain candidates like
    'careers.{root_domain}' are formed correctly.

    Example:
        'https://www.acme.com/about' → ('https', 'www.acme.com', 'acme.com')
    """
    parts = urlsplit(website)
    scheme = parts.scheme or "https"
    host = parts.netloc or parts.path.split("/")[0]
    root = re.sub(r"^www\.", "", host, count=1, flags=re.IGNORECASE)
    return scheme, host, root


# ---------------------------------------------------------------------------
# Tier 2 — URL-pattern probing
# ---------------------------------------------------------------------------


def probe_url_patterns(website: str, client: httpx.Client) -> str | None:
    """Probe well-known career URL paths and subdomains; return first reachable URL.

    Probes in order:
      /careers, /career, /jobs, /join-us, /join
      careers.{root_domain}, jobs.{root_domain}

    Each candidate is checked with ``probe_url``; a truthy result is treated as
    the final URL and must also pass ``_is_plausible_careers_url``.

    Args:
        website: Company website URL (a missing scheme defaults to https).
        client: HTTP client handed to ``probe_url``.

    Returns:
        The first plausible final URL, or None when no candidate qualifies.
    """
    scheme, host, root = _base_parts(website)
    base = f"{scheme}://{host}"

    candidates: list[str] = [
        f"{base}/careers",
        f"{base}/career",
        f"{base}/jobs",
        f"{base}/join-us",
        f"{base}/join",
        f"{scheme}://careers.{root}",
        f"{scheme}://jobs.{root}",
    ]

    for candidate in candidates:
        result = probe_url(client, candidate)
        if not result:
            continue
        if _is_plausible_careers_url(candidate, result):
            logger.info("heuristics.probe_url_patterns(%s): hit url=%s", website, result)
            return result
        logger.info(
            "heuristics.probe_url_patterns(%s): rejected redirect %s → %s",
            website,
            candidate,
            result,
        )

    logger.info("heuristics.probe_url_patterns(%s): no pattern matched", website)
    return None


# ---------------------------------------------------------------------------
# Tier 3 — Homepage link scan
# ---------------------------------------------------------------------------


def scan_homepage_links(
    website: str,
    client: httpx.Client,
    *,
    homepage_html: str | None = None,
) -> str | None:
    """Rank anchors on the homepage by career/job keywords; return best match.

    If *homepage_html* is provided it is used directly.  Otherwise the homepage
    is fetched with ``request_with_retries``; a fetch error or an HTTP status
    of 400 or above is logged and yields None.

    Args:
        website: Homepage URL; also the base for resolving relative hrefs.
        client: HTTP client used when the homepage must be fetched.
        homepage_html: Optional pre-fetched homepage HTML.

    Returns:
        The absolute URL of the highest-scoring anchor whose score reaches
        ``_SCORE_THRESHOLD`` (the earliest anchor wins ties), or None.
    """
    html = homepage_html
    if html is None:
        try:
            resp = request_with_retries(client, "GET", website, max_retries=1)
            if resp.status_code >= 400:
                logger.warning(
                    "heuristics.scan_homepage_links(%s): HTTP %s", website, resp.status_code
                )
                return None
            html = resp.text
        except Exception as exc:
            logger.warning(
                "heuristics.scan_homepage_links(%s): fetch error: %s", website, exc
            )
            return None

    if not html:
        return None

    soup = BeautifulSoup(html, "lxml")
    best_url: str | None = None
    best_score: float = 0.0

    for tag in soup.find_all("a", href=True):
        href: str = tag["href"].strip()
        # Skip non-HTTP links and fragment-only anchors.
        if not href or href.startswith(_SKIPPED_HREF_PREFIXES):
            continue
        try:
            full_url = urljoin(website, href)
        except ValueError:
            # urljoin raises on malformed hrefs (e.g. an unbalanced IPv6
            # bracket); one bad anchor must not abort the whole scan.
            continue
        # Keep only http(s) links.
        if not full_url.startswith(("http://", "https://")):
            continue

        score = _score_anchor(href, tag.get_text(separator=" ", strip=True))
        if score > best_score:
            best_score = score
            best_url = full_url

    if best_url and best_score >= _SCORE_THRESHOLD:
        logger.info(
            "heuristics.scan_homepage_links(%s): hit url=%s score=%.1f",
            website,
            best_url,
            best_score,
        )
        return best_url

    logger.info(
        "heuristics.scan_homepage_links(%s): no link above threshold (best=%.1f)",
        website,
        best_score,
    )
    return None


def _score_anchor(href: str, text: str) -> float:
    """Compute a relevance score for an anchor based on its href and text.

    Every matching pattern in ``_HREF_WEIGHTS`` (against the href) and
    ``_TEXT_WEIGHTS`` (against the anchor text) adds its weight to the score.

    A relative href without a leading slash ("careers", "jobs.html") gets a
    "/" prepended before matching, because every href pattern starts with
    "/"; without it such links could never score on their href.
    """
    target = href
    if not target.startswith("/") and not _SCHEME_RE.match(target):
        target = f"/{target}"

    score = 0.0
    for pattern, weight in _HREF_WEIGHTS:
        if pattern.search(target):
            score += weight
    for pattern, weight in _TEXT_WEIGHTS:
        if pattern.search(text):
            score += weight
    return score


# ---------------------------------------------------------------------------
# Tier 4 — Sitemap
# ---------------------------------------------------------------------------

_CAREER_URL_RE = re.compile(
    r"/(career|job|join|opening|position|opportunit|vacanc|hiring)",
    re.IGNORECASE,
)


def parse_sitemap(website: str, client: httpx.Client) -> str | None:
    """Fetch /sitemap.xml and scan <loc> URLs for career/job keywords.

    If the sitemap is an index, fetches up to _MAX_SITEMAP_CHILDREN child
    sitemaps.  Scans up to _MAX_LOC_ENTRIES <loc> entries in total.

    Args:
        website: Company website URL; only its scheme and host are used.
        client: HTTP client used for the sitemap requests.

    Returns:
        The first matching URL, or None (also when the sitemap cannot be
        fetched or responds with a status of 400 or above).
    """
    scheme, host, _ = _base_parts(website)
    sitemap_url = f"{scheme}://{host}/sitemap.xml"

    try:
        xml = _fetch_xml(client, sitemap_url)
    except Exception as exc:
        logger.info("heuristics.parse_sitemap(%s): fetch error: %s", website, exc)
        return None

    if xml is None:
        logger.info("heuristics.parse_sitemap(%s): sitemap not found", website)
        return None

    result = _scan_sitemap_xml(xml, website, client)
    if result:
        logger.info("heuristics.parse_sitemap(%s): hit url=%s", website, result)
    else:
        logger.info("heuristics.parse_sitemap(%s): no career URL found", website)
    return result


def _fetch_xml(client: httpx.Client, url: str) -> str | None:
    """GET a URL and return the text if the response is < 400, else None."""
    resp = request_with_retries(client, "GET", url, max_retries=1)
    if resp.status_code >= 400:
        return None
    return resp.text


def _first_career_loc(soup: BeautifulSoup, budget: int) -> tuple[str | None, int]:
    """Scan at most *budget* <loc> elements of a parsed sitemap.

    Returns:
        ``(match, consumed)``: the first <loc> text matching ``_CAREER_URL_RE``
        (or None) and the number of <loc> entries examined, so callers can
        share one budget across several sitemaps.
    """
    consumed = 0
    for loc_tag in soup.find_all("loc"):
        if consumed >= budget:
            break
        consumed += 1
        loc: str = loc_tag.get_text(strip=True)
        if _CAREER_URL_RE.search(loc):
            return loc, consumed
    return None, consumed


def _scan_sitemap_xml(xml: str, website: str, client: httpx.Client) -> str | None:
    """Parse sitemap XML; handle a sitemap index by fetching its children.

    Args:
        xml: Text of the top-level sitemap document.
        website: Website being processed (used for log context only).
        client: HTTP client used to fetch child sitemaps.

    Returns:
        The first <loc> URL matching ``_CAREER_URL_RE``, or None.  At most
        ``_MAX_SITEMAP_CHILDREN`` children are fetched and at most
        ``_MAX_LOC_ENTRIES`` <loc> entries are examined across all of them.
    """
    soup = BeautifulSoup(xml, "xml")

    sitemap_tags = soup.find_all("sitemap")
    if not sitemap_tags:
        # Plain sitemap: scan its <loc> entries using the parsed document.
        match, _ = _first_career_loc(soup, _MAX_LOC_ENTRIES)
        return match

    # Sitemap index: each <sitemap> element points at a child via its <loc>.
    child_urls: list[str] = []
    for tag in sitemap_tags:
        loc_tag = tag.find("loc")
        if loc_tag is None:
            continue
        child_url = loc_tag.get_text(strip=True)
        if child_url:
            child_urls.append(child_url)

    remaining = _MAX_LOC_ENTRIES
    for child_url in child_urls[:_MAX_SITEMAP_CHILDREN]:
        if remaining <= 0:
            break
        try:
            child_xml = _fetch_xml(client, child_url)
        except Exception as exc:
            # Best effort: one unreachable child must not abort the scan.
            logger.debug(
                "heuristics.parse_sitemap(%s): child sitemap %s fetch error: %s",
                website,
                child_url,
                exc,
            )
            continue
        if not child_xml:
            continue
        match, consumed = _first_career_loc(BeautifulSoup(child_xml, "xml"), remaining)
        if match:
            return match
        remaining -= consumed
    return None


def _scan_locs(xml: str) -> str | None:
    """Scan <loc> elements in a sitemap for career/job keywords.

    Examines at most ``_MAX_LOC_ENTRIES`` entries and returns the first URL
    matching ``_CAREER_URL_RE``, or None.
    """
    match, _ = _first_career_loc(BeautifulSoup(xml, "xml"), _MAX_LOC_ENTRIES)
    return match