235 lines
9.0 KiB
Python
235 lines
9.0 KiB
Python
"""find_careers_page(): orchestrate the Stage 2 tier cascade.
|
|
|
|
Cascade order (return early on first success):
  1.  ATS detection   → ats.detect_and_fetch()             confidence 0.95
  1b. ATS slug guess  → ats.recover_via_slug_guess()       confidence 0.90
  2.  URL patterns    → heuristics.probe_url_patterns()    confidence 0.80
  3.  Homepage scan   → heuristics.scan_homepage_links()   confidence 0.60
  4.  Sitemap         → heuristics.parse_sitemap()         confidence 0.50
  5.  Cheap-LLM       → classify_llm (stub, not implemented in this phase)
  6.  Browser agent   → agent_fallback (stub, not implemented in this phase)

A tier 2–4 hit whose URL or page turns out to be an ATS board is upgraded to
an ATS result (confidence 0.95).
|
|
|
|
Returns a CareersResult with the URL, confidence, method string, and — when
|
|
the ATS tier resolves — the first open-position URL for free (Stage-3 shortcut).
|
|
|
|
The optional *client* parameter follows the managed-client pattern from
|
|
resolve.py: supply an existing httpx.Client to reuse connections; otherwise a
|
|
short-lived client is created and closed here.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
|
|
import httpx
|
|
from pydantic import BaseModel
|
|
|
|
from ..http import build_client, request_with_retries
|
|
from . import ats as _ats
|
|
from . import heuristics as _heuristics
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Result model
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class CareersResult(BaseModel):
    """Outcome of find_careers_page() for a single company website.

    Every field defaults to the "nothing found" state, so a bare
    ``CareersResult()`` is what the cascade returns when all tiers miss.
    """

    # URL of the resolved careers page / job board; None when nothing was found.
    careers_url: str | None = None
    # Confidence assigned by the tier that produced the result (0.0 on a miss).
    confidence: float = 0.0
    # Which tier resolved the URL. Values emitted by this module:
    #   "ats:{name}" | "ats:{name}:slug_guess" | "url_pattern"
    #   | "homepage_scan" | "sitemap" | "none"
    method: str = "none"
    # Name of the detected ATS; set only when an ATS tier (or upgrade) resolved.
    ats_name: str | None = None
    # Stage-3 shortcut: first open job URL, populated only on ATS results.
    position_url: str | None = None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Public entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def find_careers_page(
    website: str,
    *,
    company_name: str | None = None,
    client: httpx.Client | None = None,
) -> CareersResult:
    """Discover the careers page of *website* by walking the tier cascade.

    Tiers run in order and the first one that yields a result wins:
    ATS detection, ATS slug-guess recovery, URL-pattern probing, homepage
    link scan, sitemap.  An exception inside a tier is logged as a warning
    and the cascade moves on to the next tier.

    Args:
        website: Company website URL; also used as the homepage to fetch.
        company_name: Optional company name forwarded to the slug-guess
            recovery tier.
        client: Optional httpx.Client to reuse.  When omitted, a client is
            created via build_client() and closed before returning.

    Returns:
        A CareersResult; the default (empty) instance when every tier misses.
    """
    owns_client = client is None
    http = build_client() if owns_client else client

    try:
        # One homepage fetch, shared by ATS detection and the homepage-link scan.
        homepage_html: str | None = _safe_get_html(website, http)

        # --------------------------------------------------------------
        # Tier 1 — ATS detection + public JSON API
        # --------------------------------------------------------------
        try:
            detected = _ats.detect_and_fetch(
                website, http, homepage_html=homepage_html
            )
            if detected is not None:
                logger.info(
                    "cascade(%s): resolved via ats:%s careers_url=%s",
                    website, detected.ats_name, detected.careers_url,
                )
                return CareersResult(
                    careers_url=detected.careers_url,
                    confidence=0.95,
                    method=f"ats:{detected.ats_name}",
                    ats_name=detected.ats_name,
                    position_url=detected.position_url,
                )
        except Exception as exc:
            logger.warning("cascade(%s): ats tier error: %s", website, exc)

        # --------------------------------------------------------------
        # Tier 1b — Slug-guess ATS recovery (JS-embedded / SPA boards)
        # --------------------------------------------------------------
        try:
            recovered = _ats.recover_via_slug_guess(website, company_name, http)
            if recovered is not None:
                logger.info(
                    "cascade(%s): resolved via ats:%s:slug_guess careers_url=%s",
                    website, recovered.ats_name, recovered.careers_url,
                )
                return CareersResult(
                    careers_url=recovered.careers_url,
                    confidence=0.90,
                    method=f"ats:{recovered.ats_name}:slug_guess",
                    ats_name=recovered.ats_name,
                    position_url=recovered.position_url,
                )
        except Exception as exc:
            logger.warning("cascade(%s): slug_guess tier error: %s", website, exc)

        # --------------------------------------------------------------
        # Tiers 2-4 — heuristic probes, tried in order.
        # Each entry: (method, confidence, probe, warning format).
        # Probes are lambdas so the heuristics lookup and call both happen
        # inside the per-tier try block below.
        # --------------------------------------------------------------
        heuristic_tiers = (
            (
                "url_pattern",
                0.80,
                lambda: _heuristics.probe_url_patterns(website, http),
                "cascade(%s): url_pattern tier error: %s",
            ),
            (
                # Reuses the homepage HTML fetched above.
                "homepage_scan",
                0.60,
                lambda: _heuristics.scan_homepage_links(
                    website, http, homepage_html=homepage_html
                ),
                "cascade(%s): homepage_scan tier error: %s",
            ),
            (
                "sitemap",
                0.50,
                lambda: _heuristics.parse_sitemap(website, http),
                "cascade(%s): sitemap tier error: %s",
            ),
        )
        for tier_method, tier_confidence, probe, error_format in heuristic_tiers:
            try:
                found = probe()
                if found:
                    return _finalize(
                        found, tier_method, tier_confidence, website, http
                    )
            except Exception as exc:
                logger.warning(error_format, website, exc)

        # All deterministic tiers missed.
        logger.info("cascade(%s): all deterministic tiers missed", website)
        return CareersResult()

    finally:
        # Only close a client this function created itself.
        if owns_client:
            http.close()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Internal helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _detect_ats_in_page(url: str, client: httpx.Client) -> "_ats.ATSBoard | None":
    """Fetch *url* and run ATS detection on its HTML.

    Best-effort: never raises.  Returns whatever _ats.detect_ats_in_html()
    yields for a response with status < 400, and None when the request
    fails, the status is >= 400, or detection itself raises.  Failures are
    logged at debug level instead of being silently discarded, so a broken
    detector or a blocked page can be diagnosed.
    """
    try:
        # Single attempt (max_retries=0): this is an optional upgrade probe.
        resp = request_with_retries(client, "GET", url, max_retries=0)
        if resp.status_code < 400:
            return _ats.detect_ats_in_html(resp.text)
        logger.debug(
            "cascade: ats page probe GET %s returned HTTP %s", url, resp.status_code
        )
    except Exception as exc:
        logger.debug("cascade: ats page probe GET %s error: %s", url, exc)
    return None
|
|
|
|
|
|
def _safe_get_html(website: str, client: httpx.Client) -> str | None:
    """Fetch the homepage on a best-effort basis.

    Returns the response text when the GET succeeds with a status below 400.
    Any HTTP error status or exception is logged at info level and mapped
    to None; nothing is raised to the caller.
    """
    try:
        response = request_with_retries(client, "GET", website, max_retries=1)
        status = response.status_code
        if status >= 400:
            logger.info("cascade: homepage GET %s returned HTTP %s", website, status)
            return None
        return response.text
    except Exception as exc:
        logger.info("cascade: homepage GET %s error: %s", website, exc)
    return None
|
|
|
|
|
|
def _finalize(
    url: str,
    method: str,
    confidence: float,
    website: str,
    client: httpx.Client,
) -> CareersResult:
    """Build the CareersResult for a heuristic hit, upgrading to ATS if possible.

    If the URL found by a heuristic tier is itself an ATS board page
    (e.g. jobs.lever.co/acme), or the page it points to embeds one, the
    board is fetched so the result can carry a position_url and the
    confidence is raised to 0.95 with method "ats:{name}".

    The upgrade is strictly optional: any failure while detecting or
    fetching the board is logged as a warning and the plain heuristic
    result is returned, so a valid heuristic hit is never lost because of
    the upgrade attempt.

    Args:
        url: Careers URL found by the heuristic tier.
        method: Name of the heuristic tier that found *url*.
        confidence: Confidence to report when no ATS upgrade happens.
        website: Company website; used only for log context.
        client: HTTP client used for the page probe and the board fetch.

    Returns:
        The ATS-upgraded CareersResult when detection and fetch succeed,
        otherwise a CareersResult carrying *url*, *method* and *confidence*.
    """
    try:
        board = _ats.detect_ats_in_url(url)
        if board is None:
            # The URL itself isn't an ATS board link; fetch the page and check
            # its HTML.  This catches companies like Vercel/Figma whose
            # Greenhouse embed is only on /careers.
            board = _detect_ats_in_page(url, client)

        if board is not None:
            fetch_fn = _ats._FETCH_DISPATCH.get(board.ats_name)
            if fetch_fn is not None:
                fetch = fetch_fn(board, client)  # type: ignore[operator]
                # Read the fetch result before logging: if the fetch produced
                # nothing usable this raises and we fall back below without
                # having announced an upgrade that did not happen.
                position_url = fetch.first_url
                upgraded_method = f"ats:{board.ats_name}"
                logger.info(
                    "cascade(%s): %s hit upgraded to %s careers_url=%s",
                    website, method, upgraded_method, board.careers_url,
                )
                return CareersResult(
                    careers_url=board.careers_url,
                    confidence=0.95,
                    method=upgraded_method,
                    ats_name=board.ats_name,
                    position_url=position_url,
                )
    except Exception as exc:
        logger.warning(
            "cascade(%s): ats upgrade for %s failed: %s", website, method, exc
        )

    # No ATS upgrade: report the heuristic hit as found.
    logger.info(
        "cascade(%s): resolved via %s careers_url=%s confidence=%.2f",
        website, method, url, confidence,
    )
    return CareersResult(
        careers_url=url,
        confidence=confidence,
        method=method,
    )
|