phase2-ATS + heuristic careers finding
This commit is contained in:
@@ -1,11 +1,334 @@
|
||||
"""Deterministic careers-page heuristics: URL probing, homepage scan, sitemap (Stage 2, tiers 2–4).
|
||||
|
||||
Scaffold stub -- not implemented yet.
|
||||
Tier 2 — URL patterns: probe /careers, /career, /jobs, /join-us, /join and
|
||||
subdomains careers.{domain}, jobs.{domain} via HTTP HEAD/GET.
|
||||
Tier 3 — Homepage link scan: parse <a> anchors from the homepage HTML, rank
|
||||
by career/job keywords in href and text, return highest-scored link.
|
||||
Tier 4 — Sitemap: fetch /sitemap.xml (and sitemap index children), scan <loc>
|
||||
elements for career/job keywords, return the first match.
|
||||
"""
|
||||
# Stage 2, tiers 2–4, implemented per CLAUDE.md "Stage 2 — URL patterns / homepage / sitemap".
# Tier 2 — URL patterns: probe /careers, /career, /jobs, /join-us, /join,
#   careers.{domain}, jobs.{domain} via HTTP HEAD (or GET if HEAD fails).
# Tier 3 — Homepage link scan: fetch homepage HTML, parse with BeautifulSoup + lxml,
#   rank <a> anchors by career/job keywords in href/text, return highest-ranked.
# Tier 4 — Sitemap: fetch sitemap.xml (and sitemap index if present), scan for career/job URLs.
# Each function returns (url: str | None) so cascade.py can return early on first hit.
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import urljoin, urlsplit
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ..http import probe_url, request_with_retries
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# URL path segments and keywords that signal a careers page.
|
||||
_CAREER_PATH_PATTERNS: list[re.Pattern[str]] = [
|
||||
re.compile(p, re.IGNORECASE)
|
||||
for p in [
|
||||
r"/career", # /careers, /career
|
||||
r"/job", # /jobs, /job-listings
|
||||
r"/join", # /join-us, /join-our-team
|
||||
r"/work-with", # /work-with-us
|
||||
r"/we-re-hiring",
|
||||
r"/openings?",
|
||||
r"/opportunities",
|
||||
r"/positions?",
|
||||
r"/vacancies",
|
||||
r"/hiring",
|
||||
]
|
||||
]
|
||||
|
||||
# Weighted keyword scoring for anchor text and href paths.
|
||||
# Tuples: (compiled pattern, score)
|
||||
_HREF_WEIGHTS: list[tuple[re.Pattern[str], float]] = [
|
||||
(re.compile(r"/career", re.I), 3.0),
|
||||
(re.compile(r"/job", re.I), 2.5),
|
||||
(re.compile(r"/join", re.I), 2.0),
|
||||
(re.compile(r"/opening", re.I), 2.0),
|
||||
(re.compile(r"/position", re.I), 2.0),
|
||||
(re.compile(r"/opportunit", re.I), 1.5),
|
||||
(re.compile(r"/work.with", re.I), 1.5),
|
||||
(re.compile(r"/hiring", re.I), 1.5),
|
||||
(re.compile(r"/vacanc", re.I), 1.5),
|
||||
]
|
||||
|
||||
_TEXT_WEIGHTS: list[tuple[re.Pattern[str], float]] = [
|
||||
(re.compile(r"\bcareers?\b", re.I), 3.0),
|
||||
(re.compile(r"\bjobs?\b", re.I), 2.5),
|
||||
(re.compile(r"\bjoin\s+us\b", re.I), 2.0),
|
||||
(re.compile(r"\bopen\s+positions?\b", re.I), 2.0),
|
||||
(re.compile(r"\bwork\s+with\s+us\b", re.I), 1.5),
|
||||
(re.compile(r"\bwe.?re\s+hiring\b", re.I), 1.5),
|
||||
(re.compile(r"\bopportunities\b", re.I), 1.5),
|
||||
(re.compile(r"\bhiring\b", re.I), 1.0),
|
||||
]
|
||||
|
||||
# Minimum score to accept a homepage link as a careers page candidate.
|
||||
_SCORE_THRESHOLD = 2.0
|
||||
|
||||
# Maximum child sitemaps to fetch when processing a sitemap index.
|
||||
_MAX_SITEMAP_CHILDREN = 5
|
||||
|
||||
# Maximum <loc> entries to scan across all sitemaps.
|
||||
_MAX_LOC_ENTRIES = 500
|
||||
|
||||
# Soft-404 path indicators used to reject SPA-style error pages.
|
||||
_SOFT_404_INDICATORS = ("notfound", "/404", "not-found", "/pagenot", "/error-page")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _is_plausible_careers_url(original_url: str, final_url: str) -> bool:
|
||||
"""Return False for obvious false positives from redirect chains.
|
||||
|
||||
Rejects:
|
||||
- SPA soft-404 paths: final URL path contains "notfound", "/404", "not-found"
|
||||
(e.g. Netflix /careers → /NotFound?prev=...)
|
||||
- Off-brand cross-domain redirects: final domain shares no brand with original
|
||||
(e.g. microsoft.com/careers → bing.com)
|
||||
|
||||
Accepts legitimate cross-domain redirects where the brand is preserved
|
||||
(e.g. amazon.com → amazon.jobs).
|
||||
"""
|
||||
orig = urlsplit(original_url)
|
||||
final = urlsplit(final_url)
|
||||
|
||||
# 1. Reject soft-404 path indicators (case-insensitive).
|
||||
final_path_lower = final.path.lower()
|
||||
if any(indicator in final_path_lower for indicator in _SOFT_404_INDICATORS):
|
||||
return False
|
||||
|
||||
# 2. If domain changed, verify the brand name survives in the new host.
|
||||
orig_host = re.sub(r"^www\.", "", orig.netloc, flags=re.IGNORECASE).lower()
|
||||
final_host = re.sub(r"^www\.", "", final.netloc, flags=re.IGNORECASE).lower()
|
||||
if orig_host != final_host:
|
||||
brand = orig_host.split(".")[0] # e.g. "microsoft" from "microsoft.com"
|
||||
if len(brand) > 3 and brand not in final_host:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _base_parts(website: str) -> tuple[str, str, str]:
|
||||
"""Return (scheme, host, root_domain) for a website URL.
|
||||
|
||||
root_domain strips a leading 'www.' from the host so that subdomain
|
||||
candidates like 'careers.{root_domain}' are formed correctly.
|
||||
Example: 'https://www.acme.com/about' → ('https', 'www.acme.com', 'acme.com')
|
||||
"""
|
||||
parts = urlsplit(website)
|
||||
scheme = parts.scheme or "https"
|
||||
host = parts.netloc or parts.path.split("/")[0]
|
||||
root = re.sub(r"^www\.", "", host, count=1, flags=re.IGNORECASE)
|
||||
return scheme, host, root
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tier 2 — URL-pattern probing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def probe_url_patterns(website: str, client: httpx.Client) -> str | None:
    """Try the conventional careers locations of a site and return the first hit.

    Candidates are tried in this order:
      1. paths on the site's own host: /careers, /career, /jobs, /join-us, /join
      2. subdomains of the root domain: careers.{root_domain}, jobs.{root_domain}

    Each candidate is checked with http.probe_url (HTTP HEAD with GET
    fallback). A truthy probe result is only accepted when
    _is_plausible_careers_url approves it; otherwise the redirect is logged
    and the next candidate is tried.

    Returns:
        The URL reported by the first accepted probe, or None when no
        candidate is accepted.
    """
    scheme, host, root_domain = _base_parts(website)

    path_probes = [
        f"{scheme}://{host}/{slug}"
        for slug in ("careers", "career", "jobs", "join-us", "join")
    ]
    subdomain_probes = [
        f"{scheme}://{prefix}.{root_domain}" for prefix in ("careers", "jobs")
    ]

    for probe in path_probes + subdomain_probes:
        landed = probe_url(client, probe)
        if not landed:
            continue
        if not _is_plausible_careers_url(probe, landed):
            logger.info(
                "heuristics.probe_url_patterns(%s): rejected redirect %s → %s",
                website, probe, landed,
            )
            continue
        logger.info("heuristics.probe_url_patterns(%s): hit url=%s", website, landed)
        return landed

    logger.info("heuristics.probe_url_patterns(%s): no pattern matched", website)
    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tier 3 — Homepage link scan
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def scan_homepage_links(
    website: str,
    client: httpx.Client,
    *,
    homepage_html: str | None = None,
) -> str | None:
    """Rank <a> anchors on the homepage by career/job keywords; return best match.

    If *homepage_html* is provided it is used directly. Otherwise the homepage
    is fetched with a single retry. Every http(s) anchor is scored with
    _score_anchor; the first anchor with the highest score wins.

    Args:
        website: Homepage URL. A scheme-less value ("acme.com") is treated as
            https, matching what _base_parts does for the other tiers.
        client: HTTP client used when the homepage has to be fetched.
        homepage_html: Already-downloaded homepage markup, if available.

    Returns:
        The absolute URL of the best-scoring anchor when its score reaches
        _SCORE_THRESHOLD, otherwise None. None is also returned when the
        homepage cannot be fetched, answers with HTTP >= 400, or is empty.
    """
    # A scheme-less base can neither be fetched nor serve as a urljoin base:
    # urljoin("acme.com", "/careers") yields "/careers", which the http(s)
    # filter below would discard, so every link would be lost.
    base_url = website.strip()
    if not re.match(r"[a-z][a-z0-9+.\-]*://", base_url, re.IGNORECASE):
        base_url = "https://" + base_url.lstrip("/")

    html = homepage_html
    if html is None:
        # Best-effort fetch: any failure just means this tier has no answer.
        try:
            resp = request_with_retries(client, "GET", base_url, max_retries=1)
            if resp.status_code >= 400:
                logger.warning(
                    "heuristics.scan_homepage_links(%s): HTTP %s", website, resp.status_code
                )
                return None
            html = resp.text
        except Exception as exc:
            logger.warning(
                "heuristics.scan_homepage_links(%s): fetch error: %s", website, exc
            )
            return None

    if not html:
        return None

    soup = BeautifulSoup(html, "lxml")
    best_url: str | None = None
    best_score: float = 0.0

    for tag in soup.find_all("a", href=True):
        href: str = tag["href"].strip()
        # Skip non-HTTP links and fragment-only anchors.
        if not href or href.startswith(("mailto:", "tel:", "#", "javascript:")):
            continue

        full_url = urljoin(base_url, href)
        # Keep only http(s) links.
        if not full_url.startswith(("http://", "https://")):
            continue

        # Strict ">" keeps the earliest anchor when several tie.
        score = _score_anchor(href, tag.get_text(separator=" ", strip=True))
        if score > best_score:
            best_score = score
            best_url = full_url

    if best_url and best_score >= _SCORE_THRESHOLD:
        logger.info(
            "heuristics.scan_homepage_links(%s): hit url=%s score=%.1f",
            website, best_url, best_score,
        )
        return best_url

    logger.info(
        "heuristics.scan_homepage_links(%s): no link above threshold (best=%.1f)",
        website, best_score,
    )
    return None
|
||||
|
||||
|
||||
def _score_anchor(href: str, text: str) -> float:
    """Return the summed keyword weights for one anchor.

    The href is checked against every row of _HREF_WEIGHTS and the visible
    text against every row of _TEXT_WEIGHTS; each row whose pattern is found
    adds its weight. An anchor matching nothing scores 0.0.
    """
    total = 0.0
    for subject, table in ((href, _HREF_WEIGHTS), (text, _TEXT_WEIGHTS)):
        for regex, points in table:
            if regex.search(subject):
                total += points
    return total
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tier 4 — Sitemap
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_CAREER_URL_RE = re.compile(
|
||||
r"/(career|job|join|opening|position|opportunit|vacanc|hiring)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def parse_sitemap(website: str, client: httpx.Client) -> str | None:
    """Look for a careers/jobs URL in the site's /sitemap.xml.

    The sitemap is requested from the website's own host. When it turns out
    to be a sitemap index, up to _MAX_SITEMAP_CHILDREN child sitemaps are
    fetched as well; scanning is capped by _MAX_LOC_ENTRIES <loc> entries.

    Returns:
        The first <loc> URL that matches the career/job keywords, or None
        when the sitemap cannot be fetched, does not exist, or contains no
        matching URL.
    """
    scheme, host, _root = _base_parts(website)
    sitemap_url = f"{scheme}://{host}/sitemap.xml"

    try:
        body = _fetch_xml(client, sitemap_url)
    except Exception as exc:
        logger.info("heuristics.parse_sitemap(%s): fetch error: %s", website, exc)
        return None

    if body is None:
        logger.info("heuristics.parse_sitemap(%s): sitemap not found", website)
        return None

    found = _scan_sitemap_xml(body, website, client)
    if not found:
        logger.info("heuristics.parse_sitemap(%s): no career URL found", website)
        return found

    logger.info("heuristics.parse_sitemap(%s): hit url=%s", website, found)
    return found
|
||||
|
||||
|
||||
def _fetch_xml(client: httpx.Client, url: str) -> str | None:
    """Download *url* with one retry; return its body text, or None on HTTP >= 400.

    Exceptions raised by request_with_retries are not caught here and
    propagate to the caller.
    """
    response = request_with_retries(client, "GET", url, max_retries=1)
    return response.text if response.status_code < 400 else None
|
||||
|
||||
|
||||
def _scan_sitemap_xml(xml: str, website: str, client: httpx.Client) -> str | None:
    """Parse sitemap XML; handle a sitemap index by fetching its children.

    A plain sitemap (<url><loc>…</loc></url>) is scanned directly. For a
    sitemap index (<sitemap><loc>…</loc></sitemap>) the first
    _MAX_SITEMAP_CHILDREN children are fetched and scanned in order. The
    _MAX_LOC_ENTRIES budget is shared by all children, so the total number of
    inspected <loc> entries never exceeds it.

    Args:
        xml: Body of the top-level sitemap document.
        website: Site being processed; only used in log messages.
        client: HTTP client used to fetch child sitemaps.

    Returns:
        The first <loc> URL matching _CAREER_URL_RE, or None.
    """
    soup = BeautifulSoup(xml, "xml")

    sitemap_tags = soup.find_all("sitemap")
    if not sitemap_tags:
        # Plain sitemap: reuse the tree parsed above instead of parsing again.
        match, _scanned = _scan_loc_tags(soup, _MAX_LOC_ENTRIES)
        return match

    child_urls: list[str] = []
    for tag in sitemap_tags:
        loc_tag = tag.find("loc")
        if loc_tag is not None:
            child_urls.append(loc_tag.get_text(strip=True))

    remaining = _MAX_LOC_ENTRIES
    for child_url in child_urls[:_MAX_SITEMAP_CHILDREN]:
        if remaining <= 0:
            break
        # Best-effort: one unreachable child must not abort the whole scan,
        # but the failure is logged rather than silently dropped.
        try:
            child_xml = _fetch_xml(client, child_url)
        except Exception as exc:
            logger.info(
                "heuristics.parse_sitemap(%s): child sitemap %s fetch error: %s",
                website, child_url, exc,
            )
            continue
        if not child_xml:
            continue
        match, scanned = _scan_loc_tags(BeautifulSoup(child_xml, "xml"), remaining)
        if match:
            return match
        remaining -= scanned
    return None


def _scan_loc_tags(soup: BeautifulSoup, limit: int) -> tuple[str | None, int]:
    """Scan the <loc> elements of a parsed sitemap for career/job keywords.

    <loc> elements that sit directly inside a <sitemap> element are skipped:
    they name further sitemap files of a nested index, not pages, and a file
    such as ".../jobs-sitemap.xml" would otherwise be reported as the careers
    page. Skipped entries do not count against *limit*.

    Returns:
        (first matching URL or None, number of <loc> entries inspected).
    """
    scanned = 0
    for loc_tag in soup.find_all("loc"):
        parent = loc_tag.parent
        if parent is not None and parent.name == "sitemap":
            continue
        if scanned >= limit:
            break
        scanned += 1
        loc: str = loc_tag.get_text(strip=True)
        if _CAREER_URL_RE.search(loc):
            return loc, scanned
    return None, scanned


def _scan_locs(xml: str) -> str | None:
    """Scan <loc> elements in a sitemap for career/job keywords.

    At most _MAX_LOC_ENTRIES entries are inspected. Returns the first
    matching URL, or None.
    """
    match, _scanned = _scan_loc_tags(BeautifulSoup(xml, "xml"), _MAX_LOC_ENTRIES)
    return match
|
||||
|
||||
Reference in New Issue
Block a user