Files

235 lines
9.0 KiB
Python

"""find_careers_page(): orchestrate the Stage 2 tier cascade.
Cascade order (return early on first success):
1. ATS detection → ats.detect_and_fetch() confidence 0.95
1b. Slug-guess ATS recovery → ats.recover_via_slug_guess() confidence 0.90
2. URL patterns → heuristics.probe_url_patterns() 0.80
3. Homepage scan → heuristics.scan_homepage_links() 0.60
4. Sitemap → heuristics.parse_sitemap() 0.50
5. Cheap-LLM → classify_llm (stub, not implemented in this phase)
6. Browser agent → agent_fallback (stub, not implemented in this phase)
Returns a CareersResult with the URL, confidence, method string, and — when
the ATS tier resolves — the first open-position URL for free (Stage-3 shortcut).
The optional *client* parameter follows the managed-client pattern from
resolve.py: supply an existing httpx.Client to reuse connections; otherwise a
short-lived client is created and closed here.
"""
from __future__ import annotations
import logging
import httpx
from pydantic import BaseModel
from ..http import build_client, request_with_retries
from . import ats as _ats
from . import heuristics as _heuristics
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Result model
# ---------------------------------------------------------------------------
class CareersResult(BaseModel):
    """Outcome of a single find_careers_page() run.

    A default-constructed instance (no URL, confidence 0.0, method "none")
    is what the cascade returns when every tier misses.
    """

    # Resolved careers-page URL; None when no tier produced a hit.
    careers_url: str | None = None
    # Confidence attached by the tier that resolved the URL; 0.0 by default.
    confidence: float = 0.0
    # Tier that produced the result. Values emitted by this module:
    #   "ats:{name}" | "ats:{name}:slug_guess" | "url_pattern"
    #   | "homepage_scan" | "sitemap" | "none"
    method: str = "none"
    # Name of the ATS behind the result; None for purely heuristic hits.
    ats_name: str | None = None
    # Free Stage-3 shortcut: first open job URL, populated only when an ATS
    # tier (or an ATS upgrade of a heuristic hit) resolves.
    position_url: str | None = None
# ---------------------------------------------------------------------------
# Public entry point
# ---------------------------------------------------------------------------
def find_careers_page(
    website: str,
    *,
    company_name: str | None = None,
    client: httpx.Client | None = None,
) -> CareersResult:
    """Walk the careers-page discovery cascade for *website*.

    Tiers run in a fixed order -- ATS detection, slug-guess ATS recovery,
    URL-pattern probing, homepage link scan, sitemap -- and the first tier
    that yields a result ends the cascade. An exception raised inside a tier
    is logged as a warning and treated as a miss, so the next tier still runs.

    *company_name* is optional and is forwarded to the slug-guess recovery
    tier. *client* may be an existing httpx.Client to reuse; when omitted, a
    client is created with build_client() and closed before returning.

    Returns a CareersResult; a default (empty) one when every tier misses.
    """
    owns_client = client is None
    http = build_client() if owns_client else client
    try:
        # One homepage fetch, shared by ATS detection and the homepage scan.
        homepage_html: str | None = _safe_get_html(website, http)

        # --------------------------------------------------------------
        # Tier 1 -- ATS detection + public JSON API
        # --------------------------------------------------------------
        try:
            detected = _ats.detect_and_fetch(
                website, http, homepage_html=homepage_html
            )
            if detected is not None:
                logger.info(
                    "cascade(%s): resolved via ats:%s careers_url=%s",
                    website, detected.ats_name, detected.careers_url,
                )
                return CareersResult(
                    careers_url=detected.careers_url,
                    confidence=0.95,
                    method=f"ats:{detected.ats_name}",
                    ats_name=detected.ats_name,
                    position_url=detected.position_url,
                )
        except Exception as exc:
            logger.warning("cascade(%s): ats tier error: %s", website, exc)

        # --------------------------------------------------------------
        # Tier 1b -- Slug-guess ATS recovery (JS-embedded / SPA boards)
        # --------------------------------------------------------------
        try:
            recovered = _ats.recover_via_slug_guess(website, company_name, http)
            if recovered is not None:
                logger.info(
                    "cascade(%s): resolved via ats:%s:slug_guess careers_url=%s",
                    website, recovered.ats_name, recovered.careers_url,
                )
                return CareersResult(
                    careers_url=recovered.careers_url,
                    confidence=0.90,
                    method=f"ats:{recovered.ats_name}:slug_guess",
                    ats_name=recovered.ats_name,
                    position_url=recovered.position_url,
                )
        except Exception as exc:
            logger.warning("cascade(%s): slug_guess tier error: %s", website, exc)

        # --------------------------------------------------------------
        # Tiers 2-4 -- heuristic probes, tried in descending confidence.
        # Each row: (method, confidence, lazy probe, warning format string).
        # The probes are lambdas so a tier only runs once the previous
        # tiers have missed.
        # --------------------------------------------------------------
        heuristic_tiers = (
            (
                "url_pattern",
                0.80,
                lambda: _heuristics.probe_url_patterns(website, http),
                "cascade(%s): url_pattern tier error: %s",
            ),
            (
                "homepage_scan",
                0.60,
                # Reuses the HTML fetched above instead of re-downloading it.
                lambda: _heuristics.scan_homepage_links(
                    website, http, homepage_html=homepage_html
                ),
                "cascade(%s): homepage_scan tier error: %s",
            ),
            (
                "sitemap",
                0.50,
                lambda: _heuristics.parse_sitemap(website, http),
                "cascade(%s): sitemap tier error: %s",
            ),
        )
        for tier_method, tier_confidence, probe, error_format in heuristic_tiers:
            try:
                hit = probe()
                if hit:
                    return _finalize(
                        hit, tier_method, tier_confidence, website, http
                    )
            except Exception as exc:
                logger.warning(error_format, website, exc)

        # All deterministic tiers missed.
        logger.info("cascade(%s): all deterministic tiers missed", website)
        return CareersResult()
    finally:
        # Only close a client this function created; a caller-supplied
        # client stays open for reuse.
        if owns_client:
            http.close()
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _detect_ats_in_page(url: str, client: httpx.Client) -> "_ats.ATSBoard | None":
    """Fetch *url* and run ATS detection on the response HTML.

    Best-effort: performs a single GET (no retries) and, when the status code
    is below 400, returns whatever _ats.detect_ats_in_html() yields for the
    body. An HTTP error status or any exception results in None; both cases
    are logged at info level rather than silently dropped, so a failed probe
    can be told apart from a page that simply has no ATS markers.
    """
    try:
        resp = request_with_retries(client, "GET", url, max_retries=0)
        if resp.status_code < 400:
            return _ats.detect_ats_in_html(resp.text)
        logger.info(
            "cascade: ats page GET %s returned HTTP %s", url, resp.status_code
        )
    except Exception as exc:
        # Deliberately broad: this probe must never break the cascade.
        logger.info("cascade: ats page GET %s error: %s", url, exc)
    return None
def _safe_get_html(website: str, client: httpx.Client) -> str | None:
    """Fetch the homepage and return its HTML, or None if that fails.

    Issues one GET with a single retry. A status code of 400 or above, or any
    exception during the request, is logged at info level and yields None.
    """
    page_html: str | None = None
    try:
        response = request_with_retries(client, "GET", website, max_retries=1)
        if response.status_code >= 400:
            logger.info(
                "cascade: homepage GET %s returned HTTP %s",
                website,
                response.status_code,
            )
        else:
            page_html = response.text
    except Exception as exc:
        logger.info("cascade: homepage GET %s error: %s", website, exc)
    return page_html
def _finalize(
    url: str,
    method: str,
    confidence: float,
    website: str,
    client: httpx.Client,
) -> CareersResult:
    """Turn a heuristic hit into a CareersResult, upgrading to ATS if possible.

    If *url* is itself an ATS board link (e.g. jobs.lever.co/acme), or the
    page behind it contains an ATS embed, the board's fetcher is invoked and
    the result is upgraded: careers_url becomes the board URL, confidence
    becomes 0.95, method becomes "ats:{name}", and position_url is filled
    from the fetch result.

    The whole upgrade attempt -- detection included -- is best-effort. Any
    exception is logged as a warning and the original heuristic hit
    (*url*, *method*, *confidence*) is returned unchanged, so a failure while
    probing for an ATS never discards an already-valid result.

    *website* is used only for log context.
    """
    try:
        board = _ats.detect_ats_in_url(url)
        if board is None:
            # The URL itself isn't an ATS board link; fetch the page and
            # check its HTML. This catches sites whose ATS embed only
            # appears on the careers page itself.
            board = _detect_ats_in_page(url, client)
        if board is not None:
            fetch_fn = _ats._FETCH_DISPATCH.get(board.ats_name)
            # No registered fetcher for this ATS: keep the heuristic result.
            if fetch_fn is not None:
                fetch = fetch_fn(board, client)  # type: ignore[operator]
                upgraded_method = f"ats:{board.ats_name}"
                logger.info(
                    "cascade(%s): %s hit upgraded to %s careers_url=%s",
                    website, method, upgraded_method, board.careers_url,
                )
                return CareersResult(
                    careers_url=board.careers_url,
                    confidence=0.95,
                    method=upgraded_method,
                    ats_name=board.ats_name,
                    position_url=fetch.first_url,
                )
    except Exception as exc:
        logger.warning(
            "cascade(%s): ats upgrade for %s failed: %s", website, method, exc
        )

    # Fallback: report the heuristic hit as found.
    logger.info(
        "cascade(%s): resolved via %s careers_url=%s confidence=%.2f",
        website, method, url, confidence,
    )
    return CareersResult(
        careers_url=url,
        confidence=confidence,
        method=method,
    )