507 lines
18 KiB
Python
507 lines
18 KiB
Python
"""ATS detection and public JSON API fetching (Stage 2, tier 1).
|
|
|
|
Detects Greenhouse / Lever / Ashby / Workday from a company homepage's HTML or
|
|
from a URL string, then calls each platform's public (no-auth) JSON API to
|
|
return both a canonical careers-page URL and the first open-position URL.
|
|
|
|
Live-verified API shapes (2026-06-17):
|
|
Greenhouse: GET https://boards-api.greenhouse.io/v1/boards/{slug}/jobs?per_page=1
|
|
→ {"jobs":[{"absolute_url":"...","company_name":"..."},...], "meta":{"total":N}}
|
|
Lever: GET https://api.lever.co/v0/postings/{slug}?mode=json&limit=1
|
|
→ [{"hostedUrl":"..."},...] (JSON array, empty list if no slug)
|
|
Ashby: GET https://api.ashbyhq.com/posting-api/job-board/{slug}
|
|
→ {"jobs":[{"jobUrl":"..."},...], "apiVersion":"..."}
|
|
NOTE: GET only — POST is NOT used (CLAUDE.md gotcha was wrong)
|
|
Workday: POST https://{host}/wday/cxs/{tenant}/{site}/jobs
|
|
body {"appliedFacets":{},"limit":1,"offset":0,"searchText":""}
|
|
→ {"total":N,"jobPostings":[{"externalPath":"/job/..."},...]}
|
|
Job URL: https://{host}/en-US/{site}{externalPath}
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from urllib.parse import urlsplit
|
|
|
|
import httpx
|
|
from pydantic import BaseModel
|
|
|
|
from ..http import build_client, request_with_retries
|
|
from ..resolve import _slug
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Detection patterns — keyed by ATS name, applied to raw HTML or URL strings.
|
|
# Each pattern yields named groups used to build board coordinates.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Greenhouse: match the board embed script, the embed iframe src, or direct
# board links.
# Recognised embed forms: ``embed/job_board?for=``, ``embed/job_board/js?for=``
# and ``embed/job_app?for=``; the board token is always the ``for=`` value.
# The literal path segment "embed" is never accepted as a slug, so an embed URL
# in a form not listed above yields no match instead of the bogus slug "embed".
# Captures: slug1 / slug2 / slug3 (exactly one is set per match)
_GH_PATTERN = re.compile(
    r"(?:"
    # boards.greenhouse.io/{slug} or boards.greenhouse.io/embed/...?for={slug}
    r"boards\.greenhouse\.io/(?:embed/job_(?:board(?:/js)?|app)\?for=)?"
    r"(?!embed(?![A-Za-z0-9_-]))(?P<slug1>[A-Za-z0-9_-]+)"
    # job-boards.greenhouse.io/{slug} (same embed forms accepted)
    r"|job-boards\.greenhouse\.io/(?:embed/job_(?:board(?:/js)?|app)\?for=)?"
    r"(?!embed(?![A-Za-z0-9_-]))(?P<slug2>[A-Za-z0-9_-]+)"
    # any other *.greenhouse.io host serving an embed URL
    r"|greenhouse\.io/embed/job_(?:board(?:/js)?|app)\?for=(?P<slug3>[A-Za-z0-9_-]+)"
    r")",
    re.IGNORECASE,
)
|
|
|
|
# Lever boards live at jobs.lever.co/{slug}.
# Captures: slug (first path segment after the host)
_LEVER_PATTERN = re.compile(r"jobs\.lever\.co/(?P<slug>[A-Za-z0-9_-]+)", flags=re.IGNORECASE)
|
|
|
|
# Ashby boards live at jobs.ashbyhq.com/{slug}; the slug may be mixed-case and
# may contain percent-escapes, so "%" is part of the accepted character set.
# Captures: slug
_ASHBY_PATTERN = re.compile(r"jobs\.ashbyhq\.com/(?P<slug>[A-Za-z0-9_%-]+)", flags=re.IGNORECASE)
|
|
|
|
# Workday: tenant.wdN.myworkdayjobs.com[/locale]/site
# Captures: tenant, wdnum (the N in wdN), site
# The locale portion (e.g. "en-US/") is optional and consumed but not captured.
# A locale-shaped token (xx-XX) is never accepted as the site: without that
# guard a URL that stops at the locale ("…myworkdayjobs.com/en-US") would
# backtrack and report the locale itself as the site segment.
_WORKDAY_PATTERN = re.compile(
    r"(?P<tenant>[A-Za-z0-9_-]+)\.wd(?P<wdnum>\d+)\.myworkdayjobs\.com"
    r"(?:/[a-z]{2}-[A-Z]{2})?"  # optional locale segment
    r"/(?![a-z]{2}-[a-z]{2}(?![A-Za-z0-9_%-]))"  # site must not be a bare locale
    r"(?P<site>[A-Za-z0-9_%-]+)",
    re.IGNORECASE,
)
|
|
|
|
# (ats_name, pattern) pairs shared by detect_ats_in_html and detect_ats_in_url.
# Order matters: both detectors walk this list top to bottom and return the
# first pattern that yields a board.
_ATS_CHECKS: list[tuple[str, re.Pattern[str]]] = [
    ("greenhouse", _GH_PATTERN),
    ("lever", _LEVER_PATTERN),
    ("ashby", _ASHBY_PATTERN),
    ("workday", _WORKDAY_PATTERN),
]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data models
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class ATSBoard(BaseModel):
    """Board coordinates produced by ATS detection.

    The three ``wd_*`` fields are only populated for Workday boards; they
    default to None for every other platform.
    """

    # Platform identifier: greenhouse | lever | ashby | workday.
    ats_name: str
    # Board slug / company identifier.
    slug: str
    # Canonical human-readable careers board URL.
    careers_url: str
    # Workday only: full myworkdayjobs.com host.
    wd_host: str | None = None
    # Workday only: tenant portion of the host.
    wd_tenant: str | None = None
    # Workday only: site path segment.
    wd_site: str | None = None
|
|
|
|
|
|
class ATSFetch(BaseModel):
    """Result of one per-platform API fetch.

    Single source of truth for the response fields the fetchers extract.
    A default-constructed instance (no URL, zero jobs) is what the fetchers
    return on any failure.
    """

    # First open-position URL from the board, if any.
    first_url: str | None = None
    # Total jobs reported by the board.
    job_count: int = 0
    # company_name from the API (Greenhouse only).
    org_name: str | None = None
|
|
|
|
|
|
class ATSResult(BaseModel):
    """What detect_and_fetch / recover_via_slug_guess hand back to callers.

    Carries the careers URL plus, when the board has one, the first job URL.
    """

    # Platform the board belongs to.
    ats_name: str
    # Careers board URL.
    careers_url: str
    # First open-position URL; None when no job URL was obtained.
    position_url: str | None = None
    # Job count as reported by the platform fetcher.
    job_count: int = 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Detection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def detect_ats_in_html(html: str) -> ATSBoard | None:
    """Return the first ATS board recognisable in *html*, or None.

    Each pattern in ``_ATS_CHECKS`` is searched in list order
    (Greenhouse → Lever → Ashby → Workday); the first one whose match can be
    turned into a board wins.  Pure function; no network calls.
    """
    for platform, regex in _ATS_CHECKS:
        hit = regex.search(html)
        if not hit:
            continue
        candidate = _build_board(platform, hit)
        if candidate:
            return candidate
    return None
|
|
|
|
|
|
def detect_ats_in_url(url: str) -> ATSBoard | None:
    """Return the ATS board encoded in a single URL string, or None.

    Lets the cascade finalizer recognise that a heuristically found careers
    link is itself an ATS board (e.g. jobs.lever.co/acme).  Patterns are tried
    in ``_ATS_CHECKS`` order.  Pure function; no network calls.
    """
    # Both generators are lazy, so later patterns are only searched when the
    # earlier ones produced no usable board.
    hits = ((platform, regex.search(url)) for platform, regex in _ATS_CHECKS)
    boards = (_build_board(platform, hit) for platform, hit in hits if hit)
    return next((found for found in boards if found), None)
|
|
|
|
|
|
def _build_board(ats_name: str, m: re.Match[str]) -> ATSBoard | None:
    """Turn a regex match into an ATSBoard.

    Returns None when any required captured value is empty after stripping,
    or when *ats_name* is not one of the four known platforms.
    """
    if ats_name == "workday":
        # Workday needs three coordinates rather than a single slug.
        tenant, wdnum, site = (
            (m.group(key) or "").strip() for key in ("tenant", "wdnum", "site")
        )
        if not tenant or not wdnum or not site:
            return None
        host = f"{tenant}.wd{wdnum}.myworkdayjobs.com"
        return ATSBoard(
            ats_name="workday",
            slug=f"{tenant}/{site}",
            careers_url=f"https://{host}/en-US/{site}",
            wd_host=host,
            wd_tenant=tenant,
            wd_site=site,
        )

    # The remaining platforms are identified by one slug.
    if ats_name == "greenhouse":
        # Exactly one of the three alternative groups is populated per match.
        raw = m.group("slug1") or m.group("slug2") or m.group("slug3") or ""
    elif ats_name in ("lever", "ashby"):
        raw = m.group("slug") or ""
    else:
        return None  # pragma: no cover

    slug = raw.strip()
    if not slug:
        return None

    if ats_name == "greenhouse":
        careers_url = f"https://boards.greenhouse.io/{slug}"
    elif ats_name == "lever":
        careers_url = f"https://jobs.lever.co/{slug}"
    else:
        careers_url = f"https://jobs.ashbyhq.com/{slug}"
    return ATSBoard(ats_name=ats_name, slug=slug, careers_url=careers_url)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Per-ATS fetch functions — each returns ATSFetch
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _fetch_greenhouse(board: ATSBoard, client: httpx.Client) -> ATSFetch:
    """Query the Greenhouse boards API for *board*.

    Returns an ATSFetch carrying the first job's ``absolute_url``, the job
    count (``meta.total`` when present, otherwise the number of returned
    jobs) and the first job's ``company_name``.  Any non-200 response or
    exception is logged and yields an empty ATSFetch.
    """
    endpoint = f"https://boards-api.greenhouse.io/v1/boards/{board.slug}/jobs?per_page=1"
    try:
        response = request_with_retries(client, "GET", endpoint, max_retries=2)
        if response.status_code != 200:
            logger.warning("greenhouse(%s): HTTP %s", board.slug, response.status_code)
            return ATSFetch()
        payload = response.json()
        listings = payload.get("jobs") or []
        meta = payload.get("meta") or {}
        total = meta.get("total", len(listings))
        if not listings:
            return ATSFetch(first_url=None, job_count=total, org_name=None)
        head = listings[0]
        return ATSFetch(
            first_url=head.get("absolute_url"),
            job_count=total,
            org_name=head.get("company_name"),
        )
    except Exception as exc:
        # Best-effort probe: every failure degrades to "nothing found".
        logger.warning("greenhouse(%s): fetch error: %s", board.slug, exc)
        return ATSFetch()
|
|
|
|
|
|
def _fetch_lever(board: ATSBoard, client: httpx.Client) -> ATSFetch:
    """Query the Lever postings API for *board*.

    Returns an ATSFetch with the first posting's ``hostedUrl`` and the number
    of postings in the response.  Any non-200 response, non-list payload or
    exception is logged and yields an empty ATSFetch.
    """
    # NOTE(review): the request asks for limit=1, so job_count is the length
    # of that (at most one-element) response rather than a board-wide total.
    endpoint = f"https://api.lever.co/v0/postings/{board.slug}?mode=json&limit=1"
    try:
        response = request_with_retries(client, "GET", endpoint, max_retries=2)
        if response.status_code != 200:
            logger.warning("lever(%s): HTTP %s", board.slug, response.status_code)
            return ATSFetch()
        postings = response.json()
        if isinstance(postings, list):
            hosted = postings[0].get("hostedUrl") if postings else None
            return ATSFetch(first_url=hosted, job_count=len(postings))
        logger.warning("lever(%s): unexpected response type %s", board.slug, type(postings))
        return ATSFetch()
    except Exception as exc:
        # Best-effort probe: every failure degrades to "nothing found".
        logger.warning("lever(%s): fetch error: %s", board.slug, exc)
        return ATSFetch()
|
|
|
|
|
|
def _fetch_ashby(board: ATSBoard, client: httpx.Client) -> ATSFetch:
    """Query the Ashby job-board API (GET) for *board*.

    Returns an ATSFetch with the first job's ``jobUrl`` and the number of jobs
    in the response.  Any non-200 response or exception is logged and yields
    an empty ATSFetch.
    """
    endpoint = f"https://api.ashbyhq.com/posting-api/job-board/{board.slug}"
    try:
        response = request_with_retries(client, "GET", endpoint, max_retries=2)
        if response.status_code != 200:
            logger.warning("ashby(%s): HTTP %s", board.slug, response.status_code)
            return ATSFetch()
        listings = response.json().get("jobs") or []
        if listings:
            first_url = listings[0].get("jobUrl")
        else:
            first_url = None
        return ATSFetch(first_url=first_url, job_count=len(listings))
    except Exception as exc:
        # Best-effort probe: every failure degrades to "nothing found".
        logger.warning("ashby(%s): fetch error: %s", board.slug, exc)
        return ATSFetch()
|
|
|
|
|
|
def _fetch_workday(board: ATSBoard, client: httpx.Client) -> ATSFetch:
    """Query the Workday CXS jobs endpoint (POST) for *board*.

    Requires ``wd_host``, ``wd_tenant`` and ``wd_site`` on the board; if any is
    missing an empty ATSFetch is returned without a request.  On success the
    result carries the response's ``total`` and, when the first posting has an
    ``externalPath``, a job URL built as https://{host}/en-US/{site}{path}.
    Any non-200 response or exception is logged and yields an empty ATSFetch.
    """
    host, tenant, site = board.wd_host, board.wd_tenant, board.wd_site
    if not host or not tenant or not site:
        return ATSFetch()

    endpoint = f"https://{host}/wday/cxs/{tenant}/{site}/jobs"
    # Ask for a single posting; the response still carries the overall total.
    body = {"appliedFacets": {}, "limit": 1, "offset": 0, "searchText": ""}
    try:
        response = request_with_retries(
            client, "POST", endpoint,
            json=body,
            headers={"Content-Type": "application/json"},
            max_retries=2,
        )
        if response.status_code != 200:
            logger.warning("workday(%s): HTTP %s", board.slug, response.status_code)
            return ATSFetch()
        result = response.json()
        total = result.get("total", 0)
        postings = result.get("jobPostings") or []
        external_path = (postings[0].get("externalPath") or "") if postings else ""
        first_url: str | None = (
            f"https://{host}/en-US/{site}{external_path}" if external_path else None
        )
        return ATSFetch(first_url=first_url, job_count=total)
    except Exception as exc:
        # Best-effort probe: every failure degrades to "nothing found".
        logger.warning("workday(%s): fetch error: %s", board.slug, exc)
        return ATSFetch()
|
|
|
|
|
|
# ats_name → fetch function.  Every value takes (board, client) and returns an
# ATSFetch; the loose ``object`` value type is why call sites carry a
# ``type: ignore[operator]``.
_FETCH_DISPATCH: dict[str, object] = {
    "greenhouse": _fetch_greenhouse,
    "lever": _fetch_lever,
    "ashby": _fetch_ashby,
    "workday": _fetch_workday,
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Public orchestrator — HTML-detection path (Tier 1)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def detect_and_fetch(
    website: str,
    client: httpx.Client,
    *,
    homepage_html: str | None = None,
) -> ATSResult | None:
    """Detect a company's ATS from its homepage and fetch the first job URL.

    When *homepage_html* is supplied it is scanned directly and no homepage
    request is made; otherwise *website* is fetched once (a response with
    status >= 400 counts as "no HTML").

    Returns an ATSResult when a board is detected (``position_url`` may be
    None if the API call produced no job URL), or None when the homepage
    could not be fetched, was empty, or contained no ATS signal.
    """
    page = homepage_html
    if page is None:
        try:
            response = request_with_retries(client, "GET", website, max_retries=1)
            if response.status_code < 400:
                page = response.text
        except Exception as exc:
            logger.warning("ats detect_and_fetch(%s): homepage fetch error: %s", website, exc)
            return None

    board = detect_ats_in_html(page) if page else None
    if board is None:
        return None

    logger.info(
        "ats(%s): detected %s slug=%s careers_url=%s",
        website, board.ats_name, board.slug, board.careers_url,
    )

    fetcher = _FETCH_DISPATCH.get(board.ats_name)
    if fetcher is None:  # pragma: no cover
        return ATSResult(ats_name=board.ats_name, careers_url=board.careers_url)

    outcome = fetcher(board, client)  # type: ignore[operator]
    logger.info(
        "ats(%s): %s board has %s jobs; first_url=%s",
        website, board.ats_name, outcome.job_count, outcome.first_url,
    )
    return ATSResult(
        ats_name=board.ats_name,
        careers_url=board.careers_url,
        position_url=outcome.first_url,
        job_count=outcome.job_count,
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Slug-guess recovery helpers — Tier 1b (JS-embedded / SPA boards)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _domain_stem(website: str) -> str | None:
    """Extract the first DNS label (lowercased) from a website URL, stripping www.

    Userinfo and port are ignored, and scheme-less input such as
    'www.acme.com/careers' or 'acme.com:8080' is accepted.

    Examples:
        'https://www.anthropic.com'              → 'anthropic'
        'https://user:pw@www.acme.com:8443/jobs' → 'acme'

    Returns None when no host can be determined.
    """
    try:
        parts = urlsplit(website)
        if not parts.netloc:
            # Without a "//" authority marker urlsplit puts the host in the
            # path (or mistakes "host:port" for "scheme:path"); re-parse the
            # input as a network-path reference so the host is recognised.
            parts = urlsplit(f"//{website}")
        # .hostname drops any "user:pw@" prefix and ":port" suffix and is
        # already lowercased.
        host = parts.hostname or ""
        host = re.sub(r"^www\.", "", host, count=1, flags=re.IGNORECASE).lower()
        stem = host.split(".")[0]
        return stem or None
    except (ValueError, TypeError, AttributeError):
        # Malformed URL or non-string input: treat as "no stem".
        return None
|
|
|
|
|
|
def _slug_candidates(website: str, company_name: str | None) -> list[str]:
    """Build the ordered, de-duplicated list of slug guesses (at most 3).

    The domain stem comes first (most specific/unique), followed by the
    normalized company name when one is given.  Empty values are dropped.
    """
    guesses = [_domain_stem(website)]
    if company_name:
        guesses.append(_slug(company_name))
    # dict.fromkeys keeps first-seen order while removing duplicates.
    unique = list(dict.fromkeys(guess for guess in guesses if guess))
    return unique[:3]
|
|
|
|
|
|
def _board_from_slug(ats_name: str, slug: str) -> ATSBoard:
    """Build the ATSBoard used to probe a guessed Greenhouse / Lever / Ashby slug.

    Any *ats_name* other than "greenhouse" or "lever" is treated as Ashby.
    """
    if ats_name == "greenhouse":
        platform, careers_url = "greenhouse", f"https://boards.greenhouse.io/{slug}"
    elif ats_name == "lever":
        platform, careers_url = "lever", f"https://jobs.lever.co/{slug}"
    else:
        # Fallthrough case: ashby.
        platform, careers_url = "ashby", f"https://jobs.ashbyhq.com/{slug}"
    return ATSBoard(ats_name=platform, slug=slug, careers_url=careers_url)
|
|
|
|
|
|
def _loose_name_match(input_name: str, org_name: str) -> bool:
    """Report whether two company names loosely agree.

    Both names are normalized with ``_slug``; they match when one normalized
    form is a substring of the other.  If either normalizes to empty the
    comparison is unverifiable and True is returned (don't reject).
    """
    ours = _slug(input_name) or ""
    theirs = _slug(org_name) or ""
    if ours and theirs:
        return ours in theirs or theirs in ours
    return True
|
|
|
|
|
|
# Platforms tried, in this order, during slug-guess recovery.  Workday is left
# out because a single slug is not enough there (it needs tenant + site).
_SLUG_GUESS_PLATFORMS = (
    "greenhouse",
    "lever",
    "ashby",
)
|
|
|
|
|
|
def recover_via_slug_guess(
    website: str,
    company_name: str | None,
    client: httpx.Client,
) -> ATSResult | None:
    """Probe Greenhouse/Lever/Ashby APIs with guessed slugs when HTML detection misses.

    Tier 1b of the cascade: covers companies whose ATS board is injected by
    client-side JS and is therefore invisible to static HTML detection
    (e.g. Anthropic's Greenhouse board rendered by Next.js).

    Slug guesses come from ``_slug_candidates`` (domain stem first, then the
    normalized company name).  For each guess every platform in
    ``_SLUG_GUESS_PLATFORMS`` is probed; the first board that returns a job
    URL and a non-zero job count wins.

    False-positive guard: when the platform returns an org name (Greenhouse
    only) and a *company_name* was given, the two are loosely compared and a
    mismatch is skipped.

    Returns an ATSResult on the first hit, or None if every probe misses.
    """
    guesses = _slug_candidates(website, company_name)
    if not guesses:
        return None

    for guess in guesses:
        for platform in _SLUG_GUESS_PLATFORMS:
            board = _board_from_slug(platform, guess)
            probe = _FETCH_DISPATCH.get(platform)
            if probe is None:  # pragma: no cover
                continue
            try:
                outcome = probe(board, client)  # type: ignore[operator]
            except Exception as exc:
                logger.debug(
                    "recover_via_slug_guess(%s/%s/%s): error: %s",
                    website, platform, guess, exc,
                )
                continue

            has_jobs = bool(outcome.first_url) and outcome.job_count != 0
            if not has_jobs:
                continue

            # Cross-check org name if the platform provides it (Greenhouse only).
            if (
                company_name
                and outcome.org_name
                and not _loose_name_match(company_name, outcome.org_name)
            ):
                logger.info(
                    "recover_via_slug_guess(%s): %s slug=%s org_name=%r "
                    "does not match input %r — skipping",
                    website, platform, guess, outcome.org_name, company_name,
                )
                continue

            logger.info(
                "recover_via_slug_guess(%s): hit %s slug=%s jobs=%s careers_url=%s",
                website, platform, guess, outcome.job_count, board.careers_url,
            )
            return ATSResult(
                ats_name=platform,
                careers_url=board.careers_url,
                position_url=outcome.first_url,
                job_count=outcome.job_count,
            )

    logger.info("recover_via_slug_guess(%s): all candidates missed", website)
    return None
|