phase2-ATS + heuristic careers finding

2026-06-17 17:33:11 -04:00
parent cd9ab9b95e
commit 113a4ced36
11 changed files with 2836 additions and 39 deletions
--- a/jobsource/careers/heuristics.py
+++ b/jobsource/careers/heuristics.py
@@ -1,11 +1,334 @@
 """Deterministic careers-page heuristics: URL probing, homepage scan, sitemap (Stage 2, tiers 2–4).

-Scaffold stub -- not implemented yet.
+Tier 2 — URL patterns: probe /careers, /career, /jobs, /join-us, /join and
+          subdomains careers.{domain}, jobs.{domain} via HTTP HEAD/GET.
+Tier 3 — Homepage link scan: parse <a> anchors from the homepage HTML, rank
+          by career/job keywords in href and text, return highest-scored link.
+Tier 4 — Sitemap: fetch /sitemap.xml (and sitemap index children), scan <loc>
+          elements for career/job keywords, return the first match.
 """
-# TODO (Stage 2, tiers 2–4): implement per CLAUDE.md "Stage 2 — URL patterns / homepage / sitemap".
-# Tier 2 — URL patterns: probe /careers, /career, /jobs, /join-us, /join,
-#           careers.{domain}, jobs.{domain} via HTTP HEAD (or GET if HEAD fails).
-# Tier 3 — Homepage link scan: fetch homepage HTML, parse with BeautifulSoup + lxml,
-#           rank <a> anchors by career/job keywords in href/text, return highest-ranked.
-# Tier 4 — Sitemap: fetch sitemap.xml (and sitemap index if present), scan for career/job URLs.
-# Each function returns (url: str | None) so cascade.py can return early on first hit.
+from __future__ import annotations
+
+import logging
+import re
+from urllib.parse import urljoin, urlsplit
+
+import httpx
+from bs4 import BeautifulSoup
+
+from ..http import probe_url, request_with_retries
+
+logger = logging.getLogger(__name__)
+
+# URL path segments and keywords that signal a careers page.
+# Every fragment is compiled case-insensitively and left unanchored, so it
+# matches wherever it occurs in a string (r"/job" also matches "/jobs").
+# NOTE(review): no function in this module references this list — presumably
+# it is consumed elsewhere; confirm before removing or changing it.
+_CAREER_PATH_PATTERNS: list[re.Pattern[str]] = [
+    re.compile(p, re.IGNORECASE)
+    for p in [
+        r"/career",         # /careers, /career
+        r"/job",            # /jobs, /job-listings
+        r"/join",           # /join-us, /join-our-team
+        r"/work-with",      # /work-with-us
+        r"/we-re-hiring",
+        r"/openings?",
+        r"/opportunities",
+        r"/positions?",
+        r"/vacancies",
+        r"/hiring",
+    ]
+]
+
+# Weighted keyword scoring for anchor text and href paths.
+# Tuples: (compiled pattern, score)
+# _score_anchor adds the weight of *every* pattern that matches, so an anchor
+# matching several entries (in href and/or text) accumulates all their scores.
+# Href patterns are unanchored fragments searched in the raw href attribute.
+_HREF_WEIGHTS: list[tuple[re.Pattern[str], float]] = [
+    (re.compile(r"/career", re.I), 3.0),
+    (re.compile(r"/job", re.I), 2.5),
+    (re.compile(r"/join", re.I), 2.0),
+    (re.compile(r"/opening", re.I), 2.0),
+    (re.compile(r"/position", re.I), 2.0),
+    (re.compile(r"/opportunit", re.I), 1.5),
+    (re.compile(r"/work.with", re.I), 1.5),
+    (re.compile(r"/hiring", re.I), 1.5),
+    (re.compile(r"/vacanc", re.I), 1.5),
+]
+
+# Text patterns use \b word boundaries, so they match whole words/phrases in
+# the anchor's visible text rather than arbitrary substrings.
+_TEXT_WEIGHTS: list[tuple[re.Pattern[str], float]] = [
+    (re.compile(r"\bcareers?\b", re.I), 3.0),
+    (re.compile(r"\bjobs?\b", re.I), 2.5),
+    (re.compile(r"\bjoin\s+us\b", re.I), 2.0),
+    (re.compile(r"\bopen\s+positions?\b", re.I), 2.0),
+    (re.compile(r"\bwork\s+with\s+us\b", re.I), 1.5),
+    (re.compile(r"\bwe.?re\s+hiring\b", re.I), 1.5),
+    (re.compile(r"\bopportunities\b", re.I), 1.5),
+    (re.compile(r"\bhiring\b", re.I), 1.0),
+]
+
+# Minimum score to accept a homepage link as a careers page candidate.
+# scan_homepage_links compares with ">=", so a score exactly equal to the
+# threshold is accepted.
+_SCORE_THRESHOLD = 2.0
+
+# Maximum child sitemaps to fetch when processing a sitemap index.
+_MAX_SITEMAP_CHILDREN = 5
+
+# Maximum <loc> entries to scan across all sitemaps.
+_MAX_LOC_ENTRIES = 500
+
+# Soft-404 path indicators used to reject SPA-style error pages.
+# Matched as plain substrings of the lower-cased path of the final URL
+# (see _is_plausible_careers_url), hence the all-lowercase spellings.
+_SOFT_404_INDICATORS = ("notfound", "/404", "not-found", "/pagenot", "/error-page")
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _is_plausible_careers_url(original_url: str, final_url: str) -> bool:
+    """Return False for obvious false positives from redirect chains.
+
+    Rejects:
+    - SPA soft-404 paths: the lower-cased final URL path contains one of
+      ``_SOFT_404_INDICATORS`` (e.g. Netflix /careers → /NotFound?prev=...)
+    - Off-brand cross-domain redirects: final host shares no brand with the
+      original (e.g. microsoft.com/careers → bing.com)
+
+    Accepts legitimate cross-domain redirects where the brand is preserved
+    (e.g. amazon.com → amazon.jobs), including a probed subdomain collapsing
+    back onto the main site (e.g. careers.acme.com → www.acme.com/careers).
+
+    Args:
+        original_url: The URL that was requested.
+        final_url: The URL that was ultimately reached after redirects.
+
+    Returns:
+        True when *final_url* is a believable destination for *original_url*.
+    """
+    orig = urlsplit(original_url)
+    final = urlsplit(final_url)
+
+    # 1. Reject soft-404 path indicators (case-insensitive).
+    final_path_lower = final.path.lower()
+    if any(indicator in final_path_lower for indicator in _SOFT_404_INDICATORS):
+        return False
+
+    # 2. Compare hosts. ``hostname`` (unlike ``netloc``) is already lower-cased
+    #    and excludes any port or userinfo, so "acme.com:443" == "acme.com".
+    orig_host = re.sub(r"^www\.", "", orig.hostname or "")
+    final_host = re.sub(r"^www\.", "", final.hostname or "")
+    if orig_host == final_host:
+        return True
+
+    # 3. Host changed: the brand label must survive in the new host.
+    #    Leading generic labels are skipped so that probing
+    #    "careers.acme.com" yields the brand "acme", not "careers". At least
+    #    two labels are always kept so a bare "jobs.com" keeps "jobs".
+    generic_labels = ("www", "careers", "career", "jobs", "job")
+    labels = orig_host.split(".")
+    while len(labels) > 2 and labels[0] in generic_labels:
+        labels = labels[1:]
+    brand = labels[0]   # e.g. "microsoft" from "microsoft.com"
+
+    # Very short brands (<= 3 chars) are too ambiguous to enforce.
+    return len(brand) <= 3 or brand in final_host
+
+
+def _base_parts(website: str) -> tuple[str, str, str]:
+    """Split *website* into ``(scheme, host, root_domain)``.
+
+    The scheme falls back to ``"https"`` when the input carries none. The host
+    is the URL's network location, or — for scheme-less input such as
+    ``"acme.com/about"`` — the first path segment. ``root_domain`` is the host
+    with a single leading ``www.`` removed (case-insensitively) so callers can
+    build sibling hosts like ``careers.{root_domain}``.
+
+    Example: 'https://www.acme.com/about' → ('https', 'www.acme.com', 'acme.com')
+    """
+    split = urlsplit(website)
+
+    host = split.netloc
+    if not host:
+        # No "//" authority section: treat the leading path segment as the host.
+        host = split.path.partition("/")[0]
+
+    has_www_prefix = host[:4].lower() == "www."
+    root_domain = host[4:] if has_www_prefix else host
+
+    return split.scheme or "https", host, root_domain
+
+
+# ---------------------------------------------------------------------------
+# Tier 2 — URL-pattern probing
+# ---------------------------------------------------------------------------
+
+
+def probe_url_patterns(website: str, client: httpx.Client) -> str | None:
+    """Try conventional careers URLs for *website*; return the first plausible hit.
+
+    Candidates are probed in this order:
+      paths on the site's own host: /careers, /career, /jobs, /join-us, /join
+      sibling hosts: careers.{root_domain}, jobs.{root_domain}
+
+    Each candidate is handed to ``probe_url``; a truthy result is treated as
+    the final URL reached and is returned only if
+    ``_is_plausible_careers_url`` accepts it. Rejected results are logged and
+    probing continues. Returns None when no candidate qualifies.
+    """
+    scheme, host, root = _base_parts(website)
+
+    path_slugs = ("careers", "career", "jobs", "join-us", "join")
+    subdomain_labels = ("careers", "jobs")
+    same_host_urls = [f"{scheme}://{host}/{slug}" for slug in path_slugs]
+    sibling_host_urls = [f"{scheme}://{label}.{root}" for label in subdomain_labels]
+
+    for candidate in same_host_urls + sibling_host_urls:
+        resolved = probe_url(client, candidate)
+        if not resolved:
+            # Nothing reachable at this candidate; try the next one.
+            continue
+
+        if _is_plausible_careers_url(candidate, resolved):
+            logger.info("heuristics.probe_url_patterns(%s): hit url=%s", website, resolved)
+            return resolved
+
+        logger.info(
+            "heuristics.probe_url_patterns(%s): rejected redirect %s → %s",
+            website, candidate, resolved,
+        )
+
+    logger.info("heuristics.probe_url_patterns(%s): no pattern matched", website)
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Tier 3 — Homepage link scan
+# ---------------------------------------------------------------------------
+
+
+def scan_homepage_links(
+    website: str,
+    client: httpx.Client,
+    *,
+    homepage_html: str | None = None,
+) -> str | None:
+    """Pick the homepage anchor that looks most like a careers link.
+
+    When *homepage_html* is given it is parsed as-is; otherwise the homepage
+    is fetched with ``request_with_retries(client, "GET", website,
+    max_retries=1)``. A response status >= 400 or any fetch exception is
+    logged as a warning and yields None.
+
+    Every ``<a href>`` is resolved against *website* and scored with
+    ``_score_anchor`` (raw href plus visible text). The first anchor with the
+    strictly highest score wins and is returned if that score reaches
+    ``_SCORE_THRESHOLD``; otherwise None is returned.
+    """
+    markup = homepage_html
+    if markup is None:
+        try:
+            response = request_with_retries(client, "GET", website, max_retries=1)
+            if response.status_code >= 400:
+                logger.warning(
+                    "heuristics.scan_homepage_links(%s): HTTP %s", website, response.status_code
+                )
+                return None
+            markup = response.text
+        except Exception as exc:
+            logger.warning(
+                "heuristics.scan_homepage_links(%s): fetch error: %s", website, exc
+            )
+            return None
+
+    if not markup:
+        return None
+
+    # Schemes/fragments that can never be a navigable careers page.
+    ignored_prefixes = ("mailto:", "tel:", "#", "javascript:")
+    top_url: str | None = None
+    top_score: float = 0.0
+
+    for anchor in BeautifulSoup(markup, "lxml").find_all("a", href=True):
+        raw_href: str = anchor["href"].strip()
+        if raw_href == "" or raw_href.startswith(ignored_prefixes):
+            continue
+
+        absolute = urljoin(website, raw_href)
+        if absolute.startswith(("http://", "https://")):
+            anchor_text = anchor.get_text(separator=" ", strip=True)
+            anchor_score = _score_anchor(raw_href, anchor_text)
+            # Strict ">" keeps the earliest anchor when scores tie.
+            if anchor_score > top_score:
+                top_score, top_url = anchor_score, absolute
+
+    if top_url is None or top_score < _SCORE_THRESHOLD:
+        logger.info(
+            "heuristics.scan_homepage_links(%s): no link above threshold (best=%.1f)",
+            website, top_score,
+        )
+        return None
+
+    logger.info(
+        "heuristics.scan_homepage_links(%s): hit url=%s score=%.1f",
+        website, top_url, top_score,
+    )
+    return top_url
+
+
+def _score_anchor(href: str, text: str) -> float:
+    """Score an anchor by summing the weights of all matching keyword patterns.
+
+    *href* is tested against ``_HREF_WEIGHTS`` and *text* against
+    ``_TEXT_WEIGHTS``; every pattern that matches contributes its weight.
+    Returns 0.0 when nothing matches.
+    """
+    total = 0.0
+    # Href table first, then text table — same accumulation order throughout.
+    for subject, weight_table in ((href, _HREF_WEIGHTS), (text, _TEXT_WEIGHTS)):
+        for matcher, weight in weight_table:
+            if matcher.search(subject) is not None:
+                total += weight
+    return total
+
+
+# ---------------------------------------------------------------------------
+# Tier 4 — Sitemap
+# ---------------------------------------------------------------------------
+
+# Case-insensitive, unanchored pattern applied to sitemap <loc> URLs: matches
+# a career/job keyword stem that directly follows any "/" in the URL.
+# NOTE(review): stems are prefix matches, so e.g. "/joint-venture" or
+# "/positioning" would also match — confirm this looseness is acceptable.
+_CAREER_URL_RE = re.compile(
+    r"/(career|job|join|opening|position|opportunit|vacanc|hiring)",
+    re.IGNORECASE,
+)
+
+
+def parse_sitemap(website: str, client: httpx.Client) -> str | None:
+    """Look for a careers/jobs URL in the site's /sitemap.xml.
+
+    The sitemap is requested from ``{scheme}://{host}/sitemap.xml`` (parts
+    derived via ``_base_parts``). A fetch exception or a missing sitemap is
+    logged and yields None. Otherwise the document is handed to
+    ``_scan_sitemap_xml``, which also follows sitemap-index children; the scan
+    limits (``_MAX_SITEMAP_CHILDREN``, ``_MAX_LOC_ENTRIES``) are applied by
+    the helpers this function delegates to.
+
+    Returns the first matching URL, or None.
+    """
+    scheme, host, _root = _base_parts(website)
+
+    try:
+        document = _fetch_xml(client, f"{scheme}://{host}/sitemap.xml")
+    except Exception as exc:
+        logger.info("heuristics.parse_sitemap(%s): fetch error: %s", website, exc)
+        return None
+
+    if document is None:
+        logger.info("heuristics.parse_sitemap(%s): sitemap not found", website)
+        return None
+
+    match = _scan_sitemap_xml(document, website, client)
+    if not match:
+        logger.info("heuristics.parse_sitemap(%s): no career URL found", website)
+        return match
+
+    logger.info("heuristics.parse_sitemap(%s): hit url=%s", website, match)
+    return match
+
+
+def _fetch_xml(client: httpx.Client, url: str) -> str | None:
+    """Fetch *url* with GET; return the body text, or None on status >= 400.
+
+    Exceptions raised by ``request_with_retries`` are not caught here and
+    propagate to the caller.
+    """
+    response = request_with_retries(client, "GET", url, max_retries=1)
+    return response.text if response.status_code < 400 else None
+
+
+def _scan_sitemap_xml(xml: str, website: str, client: httpx.Client) -> str | None:
+    """Scan a sitemap document, following sitemap-index children when present.
+
+    Args:
+        xml: Text of the sitemap (or sitemap index) already fetched.
+        website: Site being processed; used only for log context.
+        client: HTTP client used to fetch child sitemaps of an index.
+
+    Returns:
+        The first <loc> URL matching ``_CAREER_URL_RE``, or None.
+
+    At most ``_MAX_SITEMAP_CHILDREN`` child sitemaps are fetched, and at most
+    ``_MAX_LOC_ENTRIES`` <loc> entries are examined in total across all of
+    them (the budget is shared, not restarted per child).
+    """
+    soup = BeautifulSoup(xml, "xml")
+
+    # Sitemap index: contains <sitemap><loc>…</loc></sitemap>
+    sitemap_tags = soup.find_all("sitemap")
+    if not sitemap_tags:
+        # Plain sitemap: contains <url><loc>…</loc></url>.
+        # Reuse the tree parsed above instead of parsing the document twice.
+        match, _ = _scan_loc_tags(soup, _MAX_LOC_ENTRIES)
+        return match
+
+    child_urls: list[str] = []
+    for tag in sitemap_tags:
+        loc_tag = tag.find("loc")
+        if loc_tag is None:
+            continue
+        child_url = loc_tag.get_text(strip=True)
+        if child_url:  # an empty <loc/> cannot be fetched
+            child_urls.append(child_url)
+
+    remaining = _MAX_LOC_ENTRIES
+    for child_url in child_urls[:_MAX_SITEMAP_CHILDREN]:
+        if remaining <= 0:
+            break
+        try:
+            child_xml = _fetch_xml(client, child_url)
+        except Exception as exc:
+            # Best-effort: one broken child must not abort the whole scan,
+            # but leave a trace instead of failing silently.
+            logger.info(
+                "heuristics.parse_sitemap(%s): child sitemap %s fetch error: %s",
+                website, child_url, exc,
+            )
+            continue
+        if not child_xml:
+            continue
+        match, scanned = _scan_loc_tags(BeautifulSoup(child_xml, "xml"), remaining)
+        if match:
+            return match
+        remaining -= scanned
+    return None
+
+
+def _scan_locs(xml: str) -> str | None:
+    """Scan <loc> elements in a sitemap for career/job keywords.
+
+    Examines at most ``_MAX_LOC_ENTRIES`` entries and returns the first URL
+    matching ``_CAREER_URL_RE``, or None.
+    """
+    match, _ = _scan_loc_tags(BeautifulSoup(xml, "xml"), _MAX_LOC_ENTRIES)
+    return match
+
+
+def _scan_loc_tags(soup: BeautifulSoup, limit: int) -> tuple[str | None, int]:
+    """Return (first career-like <loc> URL or None, number of entries examined).
+
+    Stops after examining *limit* <loc> elements of the parsed document.
+    """
+    scanned = 0
+    for loc_tag in soup.find_all("loc"):
+        if scanned >= limit:
+            break
+        scanned += 1
+        loc: str = loc_tag.get_text(strip=True)
+        if _CAREER_URL_RE.search(loc):
+            return loc, scanned
+    return None, scanned