diff --git a/.env.example b/.env.example index 0c957b5..7016b9f 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,8 @@ # == Job source / ingestion == -JOB_SOURCE=jobspy # jobspy (default, free) | apify -SEARCH_TERMS=["software engineer"] # JSON list; CLI --search overrides +# jobspy (default, free) | apify +JOB_SOURCE=jobspy +# JSON list; CLI --search overrides +SEARCH_TERMS=["software engineer"] LOCATION=United States HOURS_OLD=72 BATCH_SIZE=20 @@ -19,9 +21,11 @@ SEARCH_API_KEY= # Pydantic AI is model-agnostic — you may also set the provider's native key var: # Anthropic: ANTHROPIC_API_KEY=... # OpenAI: OPENAI_API_KEY=... -LLM_API_KEY= -CLASSIFIER_MODEL= # cheap model for link classification -AGENT_MODEL= # stronger model for the browser agent +LLM_API_KEY=PLACEHOLDER_LLM_API_KEY +# cheap model for link classification +CLASSIFIER_MODEL=PLACEHOLDER_CLASSIFIER_MODEL +# stronger model for the browser agent +AGENT_MODEL=PLACEHOLDER_AGENT_MODEL # == HTTP client == HTTP_TIMEOUT=20 diff --git a/CLAUDE.md b/CLAUDE.md index d9a4c2c..93cc8cd 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -177,10 +177,20 @@ pytest -q ## Gotchas (append confirmed findings here as you build — this section is durable memory across /clear) -- Verify ATS JSON field names against live responses before trusting them: Greenhouse `jobs[].absolute_url`; Lever `[].hostedUrl`; Ashby `jobs[].jobUrl`; Workday varies by tenant. Fix in code AND note the confirmed shape here. +- **ATS JSON shapes confirmed live (2026-06-17)** — use these field names in code: + - **Greenhouse**: `GET https://boards-api.greenhouse.io/v1/boards/{slug}/jobs?per_page=1` → `{"jobs":[{"absolute_url":"…"}],"meta":{"total":N}}`. Detect from `boards.greenhouse.io/{slug}`, `job-boards.greenhouse.io/{slug}`, or `embed/job_board?for={slug}` in page HTML. Individual job URLs may appear on either `boards.greenhouse.io` or `job-boards.greenhouse.io` — both are valid; always use the `absolute_url` field verbatim. 
+ - **Lever**: `GET https://api.lever.co/v0/postings/{slug}?mode=json&limit=1` → **JSON array** `[{"hostedUrl":"…"}]` (empty array `[]` for unknown slug, not 404). Detect from `jobs.lever.co/{slug}`. + - **Ashby**: `GET https://api.ashbyhq.com/posting-api/job-board/{slug}` → `{"jobs":[{"jobUrl":"…"}],"apiVersion":"…"}`. **GET only — not POST** (earlier entry was wrong). Slug case is preserved in the returned `jobUrl` but the API is case-insensitive — both `ramp` and `Ramp` return 200 with the same jobs. Detect from `jobs.ashbyhq.com/{slug}`. + - **Workday**: `POST https://{host}/wday/cxs/{tenant}/{site}/jobs` body `{"appliedFacets":{},"limit":1,"offset":0,"searchText":""}` → `{"total":N,"jobPostings":[{"externalPath":"/job/…"}]}`. Build job URL as `https://{host}/en-US/{site}{externalPath}`. Detect from `{tenant}.wd{N}.myworkdayjobs.com/…/{site}` in HTML. +- **ATS embeds are typically on the /careers page, not the homepage** (verified 2026-06-17 against Vercel/Figma/Ramp/Anthropic). `detect_ats_in_html` on the homepage will miss most companies. The `_finalize` upgrade in `cascade.py` handles this by fetching the heuristic-found careers URL and re-running ATS detection on it — do not remove this step. +- **SPA-rendered careers pages (e.g. Anthropic/Next.js) cannot be detected by static HTML parsing** — the ATS embed is injected by JavaScript after the initial page load. Static-tier resolution falls through to `url_pattern` only; the browser-agent tier is needed for full ATS detection on these sites. Anthropic confirmed uses Greenhouse slug `anthropic` (370+ jobs as of 2026-06-17). +- **Soft-404 and off-brand redirect filtering in `probe_url_patterns`** (added 2026-06-17): Netflix `/careers` redirects to `/NotFound?prev=…` with HTTP 200 (SPA catch-all); Microsoft `/careers` redirects to `bing.com` via aka.ms. Both are rejected by `_is_plausible_careers_url` in heuristics.py — the next probe candidates are tried instead. 
+- **Live smoke-test results (2026-06-17, 10 companies)**: 10/10 careers-URL hit rate; 4/10 ATS-resolved (Vercel→greenhouse, Linear→ashby, Figma→greenhouse, Ramp→ashby); 4/10 position_url populated. Google/Microsoft/Apple/Netflix/Amazon/Anthropic resolve via `url_pattern` only (custom or SPA-rendered ATS). - **JobSpy `company_url_direct` fill rate: 0% observed** (5/5 jobs had `website=None` in a live fetch on 2026-06-17, search: "software engineer", United States, `linkedin_fetch_description=False`). `resolve.py` is essential for **every** job, not just a gap-filler. Do not assume any job arrives with a website pre-populated. - **JobSpy `date_posted` / `listed_at` fill rate: ~40% observed** (2/5 jobs had a date; 3/5 were `None`). This is because `linkedin_fetch_description=False` (our default for speed) means LinkedIn's posted date is often absent. `listed_at` is best-effort metadata only; do not gate pipeline logic on it. - **JobSpy confirmed column names** (verified 2026-06-17): `job_url` (full LinkedIn URL incl. tracking params), `company` (display name), `company_url_direct` (company own site — always `None` in practice so far), `date_posted` (sparse when `linkedin_fetch_description=False`), `title`, `location`, `id` (may be `None`; always parse job_id from `job_url` instead). `company_url` is the LinkedIn *company page* URL — never use it as the company website. +- **Tier 1b slug-guess ATS recovery (added 2026-06-17)**: when HTML detection misses (JS-embedded / SPA boards like Anthropic's Next.js Greenhouse embed), `recover_via_slug_guess()` in `ats.py` probes Greenhouse/Lever/Ashby public JSON APIs with guessed slugs. Slug candidates: domain stem first (e.g. `anthropic.com` → `anthropic`), then `_slug(company_name)` (strips legal suffixes/punctuation). False-positive guards: require `job_count > 0`; Greenhouse responses include `company_name` (used for a loose substring cross-check against input; clear mismatches rejected). 
Workday skipped (needs tenant+site). Confidence 0.90 / method `ats:{name}:slug_guess`. Wired in `cascade.py` as Tier 1b: fires after HTML-ATS misses, before URL-pattern probing. +- **Greenhouse `company_name` field**: live Greenhouse API responses include `jobs[0]["company_name"]` — used by slug-guess cross-check. Do not remove this field from the `ATSFetch` parsing in `_fetch_greenhouse`. - LinkedIn parses the numeric job id from `/jobs/view/{id}`; strip tracking query params. - Browser Use needs Chromium installed (`playwright install chromium`) and an LLM key; without them the tier must degrade gracefully. - LinkedIn rate-limits aggressively; keep batches small while testing. diff --git a/jobsource/careers/__init__.py b/jobsource/careers/__init__.py index 5956ba6..3c6839b 100644 --- a/jobsource/careers/__init__.py +++ b/jobsource/careers/__init__.py @@ -1 +1,4 @@ """Careers page discovery sub-package (Stage 2 cascade).""" +from .cascade import CareersResult, find_careers_page + +__all__ = ["find_careers_page", "CareersResult"] diff --git a/jobsource/careers/ats.py b/jobsource/careers/ats.py index ddddbf4..649e2d8 100644 --- a/jobsource/careers/ats.py +++ b/jobsource/careers/ats.py @@ -1,17 +1,506 @@ """ATS detection and public JSON API fetching (Stage 2, tier 1). -Scaffold stub -- not implemented yet. +Detects Greenhouse / Lever / Ashby / Workday from a company homepage's HTML or +from a URL string, then calls each platform's public (no-auth) JSON API to +return both a canonical careers-page URL and the first open-position URL. + +Live-verified API shapes (2026-06-17): + Greenhouse: GET https://boards-api.greenhouse.io/v1/boards/{slug}/jobs?per_page=1 + → {"jobs":[{"absolute_url":"...","company_name":"..."},...], "meta":{"total":N}} + Lever: GET https://api.lever.co/v0/postings/{slug}?mode=json&limit=1 + → [{"hostedUrl":"..."},...] 
(JSON array, empty list if no slug) + Ashby: GET https://api.ashbyhq.com/posting-api/job-board/{slug} + → {"jobs":[{"jobUrl":"..."},...], "apiVersion":"..."} + NOTE: GET only — POST is NOT used (CLAUDE.md gotcha was wrong) + Workday: POST https://{host}/wday/cxs/{tenant}/{site}/jobs + body {"appliedFacets":{},"limit":1,"offset":0,"searchText":""} + → {"total":N,"jobPostings":[{"externalPath":"/job/..."},...]} + Job URL: https://{host}/en-US/{site}{externalPath} """ -# TODO (Stage 2, tier 1): implement per CLAUDE.md "Stage 2 — ATS detection". -# Detect Greenhouse / Lever / Ashby / Workday from the company website, then call -# their public JSON APIs (no login needed). On success, return both the careers page URL -# AND the first job posting URL (so Stage 3 can skip its own cascade for ATS companies). -# -# Confirmed ATS JSON field shapes (verify live before trusting — see CLAUDE.md Gotchas): -# Greenhouse: GET https://boards-api.greenhouse.io/v1/boards/{slug}/jobs -# → {"jobs": [{"absolute_url": "...", ...}, ...]} -# Lever: GET https://api.lever.co/v0/postings/{company}?mode=json -# → [{"hostedUrl": "...", ...}, ...] -# Ashby: POST https://api.ashbyhq.com/posting-api/job-board/{slug} -# → {"jobs": [{"jobUrl": "...", ...}, ...]} -# Workday: varies by tenant — needs per-tenant discovery logic +from __future__ import annotations + +import logging +import re +from urllib.parse import urlsplit + +import httpx +from pydantic import BaseModel + +from ..http import build_client, request_with_retries +from ..resolve import _slug + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Detection patterns — keyed by ATS name, applied to raw HTML or URL strings. +# Each pattern yields named groups used to build board coordinates. +# --------------------------------------------------------------------------- + +# Greenhouse: match board embed script or iframe src or direct board links. 
+# Captures: slug1 | slug2 | slug3 (one named group per alternative; _build_board reads whichever matched) +_GH_PATTERN = re.compile( + r"(?:" + r"boards\.greenhouse\.io/(?:embed/job_board\?for=|)(?P<slug1>[A-Za-z0-9_-]+)" + r"|job-boards\.greenhouse\.io/(?P<slug2>[A-Za-z0-9_-]+)" + r"|greenhouse\.io/embed/job_board\?for=(?P<slug3>[A-Za-z0-9_-]+)" + r")", + re.IGNORECASE, +) + +# Lever: board is always jobs.lever.co/{slug} +# Captures: slug +_LEVER_PATTERN = re.compile( + r"jobs\.lever\.co/(?P<slug>[A-Za-z0-9_-]+)", + re.IGNORECASE, +) + +# Ashby: board is always jobs.ashbyhq.com/{slug}; slugs may be mixed-case +# Captures: slug +_ASHBY_PATTERN = re.compile( + r"jobs\.ashbyhq\.com/(?P<slug>[A-Za-z0-9_%-]+)", + re.IGNORECASE, +) + +# Workday: tenant.wdN.myworkdayjobs.com[/locale]/site +# Captures: tenant, wdnum (the N in wdN), site +# Locale portion (e.g. "en-US/") is optional and consumed but not captured. +_WORKDAY_PATTERN = re.compile( + r"(?P<tenant>[A-Za-z0-9_-]+)\.wd(?P<wdnum>\d+)\.myworkdayjobs\.com" + r"(?:/[a-z]{2}-[A-Z]{2})?" # optional locale segment + r"/(?P<site>[A-Za-z0-9_%-]+)", + re.IGNORECASE, +) + +# Ordered list used by both detect_ats_in_html and detect_ats_in_url +_ATS_CHECKS: list[tuple[str, re.Pattern[str]]] = [ + ("greenhouse", _GH_PATTERN), + ("lever", _LEVER_PATTERN), + ("ashby", _ASHBY_PATTERN), + ("workday", _WORKDAY_PATTERN), +] + + +# --------------------------------------------------------------------------- +# Data models +# --------------------------------------------------------------------------- + + +class ATSBoard(BaseModel): + """Coordinates for a detected ATS board (detection output).""" + + ats_name: str # greenhouse | lever | ashby | workday + slug: str # board slug / company identifier + careers_url: str # canonical human-readable careers board URL + wd_host: str | None = None # workday only: full myworkdayjobs.com host + wd_tenant: str | None = None # workday only: tenant portion of host + wd_site: str | None = None # workday only: site path segment + + +class ATSFetch(BaseModel): + """Per-platform API fetch result; single source of truth for response field 
shapes.""" + + first_url: str | None = None # first open-position URL from the board + job_count: int = 0 # total jobs reported by the board + org_name: str | None = None # company_name from the API (Greenhouse only) + + +class ATSResult(BaseModel): + """Output of detect_and_fetch / recover_via_slug_guess: careers URL + optional first-job URL.""" + + ats_name: str + careers_url: str + position_url: str | None = None + job_count: int = 0 + + +# --------------------------------------------------------------------------- +# Detection +# --------------------------------------------------------------------------- + + +def detect_ats_in_html(html: str) -> ATSBoard | None: + """Scan page HTML for known ATS signals; return the first match or None. + + Tries patterns in order: Greenhouse → Lever → Ashby → Workday. + Pure function; no network calls. + """ + for ats_name, pattern in _ATS_CHECKS: + m = pattern.search(html) + if m: + board = _build_board(ats_name, m) + if board: + return board + return None + + +def detect_ats_in_url(url: str) -> ATSBoard | None: + """Scan a single URL string for ATS board coordinates; return first match or None. + + Used by the cascade finalizer to recognise when a heuristic-found careers + link is itself an ATS board (e.g. jobs.lever.co/acme). + Pure function; no network calls. 
+ """ + for ats_name, pattern in _ATS_CHECKS: + m = pattern.search(url) + if m: + board = _build_board(ats_name, m) + if board: + return board + return None + + +def _build_board(ats_name: str, m: re.Match[str]) -> ATSBoard | None: + """Construct an ATSBoard from a regex match; return None if slug is empty.""" + if ats_name == "greenhouse": + slug = m.group("slug1") or m.group("slug2") or m.group("slug3") or "" + slug = slug.strip() + if not slug: + return None + return ATSBoard( + ats_name="greenhouse", + slug=slug, + careers_url=f"https://boards.greenhouse.io/{slug}", + ) + + if ats_name == "lever": + slug = (m.group("slug") or "").strip() + if not slug: + return None + return ATSBoard( + ats_name="lever", + slug=slug, + careers_url=f"https://jobs.lever.co/{slug}", + ) + + if ats_name == "ashby": + slug = (m.group("slug") or "").strip() + if not slug: + return None + return ATSBoard( + ats_name="ashby", + slug=slug, + careers_url=f"https://jobs.ashbyhq.com/{slug}", + ) + + if ats_name == "workday": + tenant = (m.group("tenant") or "").strip() + wdnum = (m.group("wdnum") or "").strip() + site = (m.group("site") or "").strip() + if not (tenant and wdnum and site): + return None + host = f"{tenant}.wd{wdnum}.myworkdayjobs.com" + return ATSBoard( + ats_name="workday", + slug=f"{tenant}/{site}", + careers_url=f"https://{host}/en-US/{site}", + wd_host=host, + wd_tenant=tenant, + wd_site=site, + ) + + return None # pragma: no cover + + +# --------------------------------------------------------------------------- +# Per-ATS fetch functions — each returns ATSFetch +# --------------------------------------------------------------------------- + + +def _fetch_greenhouse(board: ATSBoard, client: httpx.Client) -> ATSFetch: + """Call Greenhouse boards API; return ATSFetch with first_url, job_count, org_name.""" + url = f"https://boards-api.greenhouse.io/v1/boards/{board.slug}/jobs?per_page=1" + try: + resp = request_with_retries(client, "GET", url, max_retries=2) + if 
resp.status_code != 200: + logger.warning("greenhouse(%s): HTTP %s", board.slug, resp.status_code) + return ATSFetch() + data = resp.json() + jobs = data.get("jobs") or [] + count = (data.get("meta") or {}).get("total", len(jobs)) + first_url = jobs[0].get("absolute_url") if jobs else None + org_name = jobs[0].get("company_name") if jobs else None + return ATSFetch(first_url=first_url, job_count=count, org_name=org_name) + except Exception as exc: + logger.warning("greenhouse(%s): fetch error: %s", board.slug, exc) + return ATSFetch() + + +def _fetch_lever(board: ATSBoard, client: httpx.Client) -> ATSFetch: + """Call Lever postings API; return ATSFetch with first_url and job_count.""" + url = f"https://api.lever.co/v0/postings/{board.slug}?mode=json&limit=1" + try: + resp = request_with_retries(client, "GET", url, max_retries=2) + if resp.status_code != 200: + logger.warning("lever(%s): HTTP %s", board.slug, resp.status_code) + return ATSFetch() + data = resp.json() + if not isinstance(data, list): + logger.warning("lever(%s): unexpected response type %s", board.slug, type(data)) + return ATSFetch() + first_url = data[0].get("hostedUrl") if data else None + return ATSFetch(first_url=first_url, job_count=len(data)) + except Exception as exc: + logger.warning("lever(%s): fetch error: %s", board.slug, exc) + return ATSFetch() + + +def _fetch_ashby(board: ATSBoard, client: httpx.Client) -> ATSFetch: + """Call Ashby job-board API (GET); return ATSFetch with first_url and job_count.""" + url = f"https://api.ashbyhq.com/posting-api/job-board/{board.slug}" + try: + resp = request_with_retries(client, "GET", url, max_retries=2) + if resp.status_code != 200: + logger.warning("ashby(%s): HTTP %s", board.slug, resp.status_code) + return ATSFetch() + data = resp.json() + jobs = data.get("jobs") or [] + first_url = jobs[0].get("jobUrl") if jobs else None + return ATSFetch(first_url=first_url, job_count=len(jobs)) + except Exception as exc: + logger.warning("ashby(%s): fetch 
error: %s", board.slug, exc) + return ATSFetch() + + +def _fetch_workday(board: ATSBoard, client: httpx.Client) -> ATSFetch: + """Call Workday CXS jobs endpoint (POST); return ATSFetch with first_url and total count.""" + if not (board.wd_host and board.wd_tenant and board.wd_site): + return ATSFetch() + api_url = ( + f"https://{board.wd_host}/wday/cxs/{board.wd_tenant}/{board.wd_site}/jobs" + ) + payload = {"appliedFacets": {}, "limit": 1, "offset": 0, "searchText": ""} + try: + resp = request_with_retries( + client, "POST", api_url, + json=payload, + headers={"Content-Type": "application/json"}, + max_retries=2, + ) + if resp.status_code != 200: + logger.warning("workday(%s): HTTP %s", board.slug, resp.status_code) + return ATSFetch() + data = resp.json() + total = data.get("total", 0) + postings = data.get("jobPostings") or [] + first_url: str | None = None + if postings: + ext_path = postings[0].get("externalPath") or "" + if ext_path: + first_url = ( + f"https://{board.wd_host}/en-US/{board.wd_site}{ext_path}" + ) + return ATSFetch(first_url=first_url, job_count=total) + except Exception as exc: + logger.warning("workday(%s): fetch error: %s", board.slug, exc) + return ATSFetch() + + +_FETCH_DISPATCH: dict[str, object] = { + "greenhouse": _fetch_greenhouse, + "lever": _fetch_lever, + "ashby": _fetch_ashby, + "workday": _fetch_workday, +} + + +# --------------------------------------------------------------------------- +# Public orchestrator — HTML-detection path (Tier 1) +# --------------------------------------------------------------------------- + + +def detect_and_fetch( + website: str, + client: httpx.Client, + *, + homepage_html: str | None = None, +) -> ATSResult | None: + """Detect the ATS for a company website and fetch the first job via its API. + + If *homepage_html* is provided it is used directly (avoids a redundant GET). 
+ Returns an ATSResult on success (position_url may be None if the board has + no live jobs), or None if no ATS was detected or the homepage fetch failed. + """ + html = homepage_html + if html is None: + try: + resp = request_with_retries(client, "GET", website, max_retries=1) + if resp.status_code < 400: + html = resp.text + except Exception as exc: + logger.warning("ats detect_and_fetch(%s): homepage fetch error: %s", website, exc) + return None + + if not html: + return None + + board = detect_ats_in_html(html) + if board is None: + return None + + logger.info( + "ats(%s): detected %s slug=%s careers_url=%s", + website, board.ats_name, board.slug, board.careers_url, + ) + + fetch_fn = _FETCH_DISPATCH.get(board.ats_name) + if fetch_fn is None: # pragma: no cover + return ATSResult(ats_name=board.ats_name, careers_url=board.careers_url) + + fetch = fetch_fn(board, client) # type: ignore[operator] + logger.info( + "ats(%s): %s board has %s jobs; first_url=%s", + website, board.ats_name, fetch.job_count, fetch.first_url, + ) + return ATSResult( + ats_name=board.ats_name, + careers_url=board.careers_url, + position_url=fetch.first_url, + job_count=fetch.job_count, + ) + + +# --------------------------------------------------------------------------- +# Slug-guess recovery helpers — Tier 1b (JS-embedded / SPA boards) +# --------------------------------------------------------------------------- + + +def _domain_stem(website: str) -> str | None: + """Extract the first DNS label (lowercased) from a website URL, stripping www. 
+ + Example: 'https://www.anthropic.com' → 'anthropic' + """ + try: + parts = urlsplit(website) + host = parts.netloc or parts.path.split("/")[0] + host = re.sub(r"^www\.", "", host, count=1, flags=re.IGNORECASE).lower() + stem = host.split(".")[0] + return stem or None + except Exception: + return None + + +def _slug_candidates(website: str, company_name: str | None) -> list[str]: + """Return ordered, deduped ATS slug candidates (≤ 3) for a company. + + Order: domain stem first (most specific/unique), then normalized company name. + """ + candidates: list[str] = [] + seen: set[str] = set() + + stem = _domain_stem(website) + if stem and stem not in seen: + candidates.append(stem) + seen.add(stem) + + if company_name: + name_slug = _slug(company_name) + if name_slug and name_slug not in seen: + candidates.append(name_slug) + seen.add(name_slug) + + return candidates[:3] + + +def _board_from_slug(ats_name: str, slug: str) -> ATSBoard: + """Construct an ATSBoard for Greenhouse / Lever / Ashby slug-guess probing.""" + if ats_name == "greenhouse": + return ATSBoard( + ats_name="greenhouse", + slug=slug, + careers_url=f"https://boards.greenhouse.io/{slug}", + ) + if ats_name == "lever": + return ATSBoard( + ats_name="lever", + slug=slug, + careers_url=f"https://jobs.lever.co/{slug}", + ) + # ashby + return ATSBoard( + ats_name="ashby", + slug=slug, + careers_url=f"https://jobs.ashbyhq.com/{slug}", + ) + + +def _loose_name_match(input_name: str, org_name: str) -> bool: + """Return True if company names loosely match (slug of one is a substring of the other). + + If either side normalizes to empty, returns True (unverifiable → don't reject). + """ + a = _slug(input_name) or "" + b = _slug(org_name) or "" + if not a or not b: + return True + return a in b or b in a + + +# Platforms probed in slug-guess recovery (Workday excluded — needs tenant+site). 
+_SLUG_GUESS_PLATFORMS = ("greenhouse", "lever", "ashby") + + +def recover_via_slug_guess( + website: str, + company_name: str | None, + client: httpx.Client, +) -> ATSResult | None: + """Probe Greenhouse/Lever/Ashby APIs with guessed slugs when HTML detection misses. + + Used as Tier 1b in the cascade — catches companies whose ATS board is + injected via client-side JS and therefore invisible to static HTML detection + (e.g. Anthropic's Greenhouse board rendered by Next.js). + + Slug candidates: domain stem first (e.g. anthropic.com → 'anthropic'), then + the normalized company name. First board with job_count > 0 wins. + + False-positive guard: if Greenhouse returns a company_name, it is loosely + cross-checked against the input company_name; a clear mismatch is rejected. + + Returns ATSResult on success or None on all-miss. + """ + candidates = _slug_candidates(website, company_name) + if not candidates: + return None + + for slug in candidates: + for ats_name in _SLUG_GUESS_PLATFORMS: + board = _board_from_slug(ats_name, slug) + fetch_fn = _FETCH_DISPATCH.get(ats_name) + if fetch_fn is None: # pragma: no cover + continue + try: + fetch = fetch_fn(board, client) # type: ignore[operator] + except Exception as exc: + logger.debug( + "recover_via_slug_guess(%s/%s/%s): error: %s", + website, ats_name, slug, exc, + ) + continue + + if not fetch.first_url or fetch.job_count == 0: + continue + + # Cross-check org name if the platform provides it (Greenhouse only). 
+ if company_name and fetch.org_name: + if not _loose_name_match(company_name, fetch.org_name): + logger.info( + "recover_via_slug_guess(%s): %s slug=%s org_name=%r " + "does not match input %r — skipping", + website, ats_name, slug, fetch.org_name, company_name, + ) + continue + + logger.info( + "recover_via_slug_guess(%s): hit %s slug=%s jobs=%s careers_url=%s", + website, ats_name, slug, fetch.job_count, board.careers_url, + ) + return ATSResult( + ats_name=ats_name, + careers_url=board.careers_url, + position_url=fetch.first_url, + job_count=fetch.job_count, + ) + + logger.info("recover_via_slug_guess(%s): all candidates missed", website) + return None diff --git a/jobsource/careers/cascade.py b/jobsource/careers/cascade.py index 828dcd4..8b2bad9 100644 --- a/jobsource/careers/cascade.py +++ b/jobsource/careers/cascade.py @@ -1,13 +1,234 @@ """find_careers_page(): orchestrate the Stage 2 tier cascade. -Scaffold stub -- not implemented yet. +Cascade order (return early on first success): + 1. ATS detection → ats.detect_and_fetch() confidence 0.95 + 2. URL patterns → heuristics.probe_url_patterns() 0.80 + 3. Homepage scan → heuristics.scan_homepage_links() 0.60 + 4. Sitemap → heuristics.parse_sitemap() 0.50 + 5. Cheap-LLM → classify_llm (stub, not implemented in this phase) + 6. Browser agent → agent_fallback (stub, not implemented in this phase) + +Returns a CareersResult with the URL, confidence, method string, and — when +the ATS tier resolves — the first open-position URL for free (Stage-3 shortcut). + +The optional *client* parameter follows the managed-client pattern from +resolve.py: supply an existing httpx.Client to reuse connections; otherwise a +short-lived client is created and closed here. """ -# TODO (Stage 2): implement per CLAUDE.md "Stage 2 — Find careers page (cascade, return on first hit)". -# Cascade order (return early on first success): -# 1. ATS detection → ats.detect_and_fetch() -# 2. URL patterns → heuristics.probe_url_patterns() -# 3. 
Homepage scan → heuristics.scan_homepage_links() -# 4. Sitemap → heuristics.parse_sitemap() -# 5. Cheap-LLM → classify_llm.classify_careers_link() -# 6. Browser agent → agent_fallback.run_fused_agent() (also handles Stage 3) -# Returns (careers_url: str | None, method: str, ats_name: str | None). +from __future__ import annotations + +import logging + +import httpx +from pydantic import BaseModel + +from ..http import build_client, request_with_retries +from . import ats as _ats +from . import heuristics as _heuristics + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Result model +# --------------------------------------------------------------------------- + + +class CareersResult(BaseModel): + """Typed output of find_careers_page().""" + + careers_url: str | None = None + confidence: float = 0.0 + # method values: "ats:{name}" | "url_pattern" | "homepage_scan" | "sitemap" | "none" + method: str = "none" + ats_name: str | None = None + # Free Stage-3 shortcut: populated when ATS tier resolves (first open job URL). + position_url: str | None = None + + +# --------------------------------------------------------------------------- +# Public entry point +# --------------------------------------------------------------------------- + + +def find_careers_page( + website: str, + *, + company_name: str | None = None, + client: httpx.Client | None = None, +) -> CareersResult: + """Run the careers-page discovery cascade for one company website. + + *company_name* is optional; when provided it supplies a second slug candidate + for the Tier 1b slug-guess recovery and enables the org-name cross-check. + Returns a CareersResult. Never raises — tier failures fall through gracefully. + """ + _managed = client is None + if _managed: + client = build_client() + + try: + # Fetch the homepage once; shared by ATS detection and homepage-link scan. 
+ homepage_html: str | None = _safe_get_html(website, client) + + # ------------------------------------------------------------------ + # Tier 1 — ATS detection + public JSON API + # ------------------------------------------------------------------ + try: + ats_result = _ats.detect_and_fetch( + website, client, homepage_html=homepage_html + ) + if ats_result is not None: + logger.info( + "cascade(%s): resolved via ats:%s careers_url=%s", + website, ats_result.ats_name, ats_result.careers_url, + ) + return CareersResult( + careers_url=ats_result.careers_url, + confidence=0.95, + method=f"ats:{ats_result.ats_name}", + ats_name=ats_result.ats_name, + position_url=ats_result.position_url, + ) + except Exception as exc: + logger.warning("cascade(%s): ats tier error: %s", website, exc) + + # ------------------------------------------------------------------ + # Tier 1b — Slug-guess ATS recovery (JS-embedded / SPA boards) + # ------------------------------------------------------------------ + try: + rec = _ats.recover_via_slug_guess(website, company_name, client) + if rec is not None: + logger.info( + "cascade(%s): resolved via ats:%s:slug_guess careers_url=%s", + website, rec.ats_name, rec.careers_url, + ) + return CareersResult( + careers_url=rec.careers_url, + confidence=0.90, + method=f"ats:{rec.ats_name}:slug_guess", + ats_name=rec.ats_name, + position_url=rec.position_url, + ) + except Exception as exc: + logger.warning("cascade(%s): slug_guess tier error: %s", website, exc) + + # ------------------------------------------------------------------ + # Tier 2 — URL-pattern probing + # ------------------------------------------------------------------ + try: + url = _heuristics.probe_url_patterns(website, client) + if url: + return _finalize(url, "url_pattern", 0.80, website, client) + except Exception as exc: + logger.warning("cascade(%s): url_pattern tier error: %s", website, exc) + + # ------------------------------------------------------------------ + # Tier 3 
— Homepage link scan (reuse fetched HTML) + # ------------------------------------------------------------------ + try: + url = _heuristics.scan_homepage_links( + website, client, homepage_html=homepage_html + ) + if url: + return _finalize(url, "homepage_scan", 0.60, website, client) + except Exception as exc: + logger.warning("cascade(%s): homepage_scan tier error: %s", website, exc) + + # ------------------------------------------------------------------ + # Tier 4 — Sitemap + # ------------------------------------------------------------------ + try: + url = _heuristics.parse_sitemap(website, client) + if url: + return _finalize(url, "sitemap", 0.50, website, client) + except Exception as exc: + logger.warning("cascade(%s): sitemap tier error: %s", website, exc) + + # All deterministic tiers missed. + logger.info("cascade(%s): all deterministic tiers missed", website) + return CareersResult() + + finally: + if _managed: + client.close() + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +def _detect_ats_in_page(url: str, client: httpx.Client) -> "_ats.ATSBoard | None": + """Fetch a page and run ATS detection on its HTML; return ATSBoard or None.""" + try: + resp = request_with_retries(client, "GET", url, max_retries=0) + if resp.status_code < 400: + return _ats.detect_ats_in_html(resp.text) + except Exception: + pass + return None + + +def _safe_get_html(website: str, client: httpx.Client) -> str | None: + """Best-effort homepage fetch; return HTML text or None on any failure.""" + try: + resp = request_with_retries(client, "GET", website, max_retries=1) + if resp.status_code < 400: + return resp.text + logger.info("cascade: homepage GET %s returned HTTP %s", website, resp.status_code) + return None + except Exception as exc: + logger.info("cascade: homepage GET %s error: %s", website, exc) + return None + + +def _finalize( + 
url: str, + method: str, + confidence: float, + website: str, + client: httpx.Client, +) -> CareersResult: + """Attempt ATS-URL upgrade for heuristic hits; return a CareersResult. + + If the URL resolved by a heuristic tier is itself an ATS board page + (e.g. jobs.lever.co/acme), detect and fetch it so we can return a + position_url and upgrade the confidence to 0.95. + """ + board = _ats.detect_ats_in_url(url) + if board is None: + # The URL itself isn't an ATS board link; fetch the page and check its HTML. + # This catches companies like Vercel/Figma whose Greenhouse embed is only on /careers. + board = _detect_ats_in_page(url, client) + if board is not None: + try: + fetch_fn = _ats._FETCH_DISPATCH.get(board.ats_name) + if fetch_fn is not None: + fetch = fetch_fn(board, client) # type: ignore[operator] + upgraded_method = f"ats:{board.ats_name}" + logger.info( + "cascade(%s): %s hit upgraded to %s careers_url=%s", + website, method, upgraded_method, board.careers_url, + ) + return CareersResult( + careers_url=board.careers_url, + confidence=0.95, + method=upgraded_method, + ats_name=board.ats_name, + position_url=fetch.first_url, + ) + except Exception as exc: + logger.warning( + "cascade(%s): ats upgrade for %s failed: %s", website, method, exc + ) + + logger.info( + "cascade(%s): resolved via %s careers_url=%s confidence=%.2f", + website, method, url, confidence, + ) + return CareersResult( + careers_url=url, + confidence=confidence, + method=method, + ) diff --git a/jobsource/careers/heuristics.py b/jobsource/careers/heuristics.py index 21a465a..8e9dbea 100644 --- a/jobsource/careers/heuristics.py +++ b/jobsource/careers/heuristics.py @@ -1,11 +1,334 @@ """Deterministic careers-page heuristics: URL probing, homepage scan, sitemap (Stage 2, tiers 2–4). -Scaffold stub -- not implemented yet. +Tier 2 — URL patterns: probe /careers, /career, /jobs, /join-us, /join and + subdomains careers.{domain}, jobs.{domain} via HTTP HEAD/GET. 
+Tier 3 — Homepage link scan: parse anchors from the homepage HTML, rank + by career/job keywords in href and text, return highest-scored link. +Tier 4 — Sitemap: fetch /sitemap.xml (and sitemap index children), scan + elements for career/job keywords, return the first match. """ -# TODO (Stage 2, tiers 2–4): implement per CLAUDE.md "Stage 2 — URL patterns / homepage / sitemap". -# Tier 2 — URL patterns: probe /careers, /career, /jobs, /join-us, /join, -# careers.{domain}, jobs.{domain} via HTTP HEAD (or GET if HEAD fails). -# Tier 3 — Homepage link scan: fetch homepage HTML, parse with BeautifulSoup + lxml, -# rank anchors by career/job keywords in href/text, return highest-ranked. -# Tier 4 — Sitemap: fetch sitemap.xml (and sitemap index if present), scan for career/job URLs. -# Each function returns (url: str | None) so cascade.py can return early on first hit. +from __future__ import annotations + +import logging +import re +from urllib.parse import urljoin, urlsplit + +import httpx +from bs4 import BeautifulSoup + +from ..http import probe_url, request_with_retries + +logger = logging.getLogger(__name__) + +# URL path segments and keywords that signal a careers page. +_CAREER_PATH_PATTERNS: list[re.Pattern[str]] = [ + re.compile(p, re.IGNORECASE) + for p in [ + r"/career", # /careers, /career + r"/job", # /jobs, /job-listings + r"/join", # /join-us, /join-our-team + r"/work-with", # /work-with-us + r"/we-re-hiring", + r"/openings?", + r"/opportunities", + r"/positions?", + r"/vacancies", + r"/hiring", + ] +] + +# Weighted keyword scoring for anchor text and href paths. 
# Tuples: (compiled pattern, score)
_HREF_WEIGHTS: list[tuple[re.Pattern[str], float]] = [
    (re.compile(r"/career", re.I), 3.0),
    (re.compile(r"/job", re.I), 2.5),
    (re.compile(r"/join", re.I), 2.0),
    (re.compile(r"/opening", re.I), 2.0),
    (re.compile(r"/position", re.I), 2.0),
    (re.compile(r"/opportunit", re.I), 1.5),
    (re.compile(r"/work.with", re.I), 1.5),
    (re.compile(r"/hiring", re.I), 1.5),
    (re.compile(r"/vacanc", re.I), 1.5),
]

_TEXT_WEIGHTS: list[tuple[re.Pattern[str], float]] = [
    (re.compile(r"\bcareers?\b", re.I), 3.0),
    (re.compile(r"\bjobs?\b", re.I), 2.5),
    (re.compile(r"\bjoin\s+us\b", re.I), 2.0),
    (re.compile(r"\bopen\s+positions?\b", re.I), 2.0),
    (re.compile(r"\bwork\s+with\s+us\b", re.I), 1.5),
    (re.compile(r"\bwe.?re\s+hiring\b", re.I), 1.5),
    (re.compile(r"\bopportunities\b", re.I), 1.5),
    (re.compile(r"\bhiring\b", re.I), 1.0),
]

# Minimum score to accept a homepage link as a careers page candidate.
_SCORE_THRESHOLD = 2.0

# Maximum child sitemaps to fetch when processing a sitemap index.
_MAX_SITEMAP_CHILDREN = 5

# Maximum <loc> entries to scan across all sitemaps (one shared budget).
_MAX_LOC_ENTRIES = 500

# Soft-404 path indicators used to reject SPA-style error pages.
_SOFT_404_INDICATORS = ("notfound", "/404", "not-found", "/pagenot", "/error-page")

# Leading host labels that name a hiring subdomain rather than the company.
# probe_url_patterns() builds "careers.{root}" / "jobs.{root}" candidates, so
# these must be skipped before the brand label is read off a host.
_GENERIC_SUBDOMAIN_LABELS = frozenset({"careers", "career", "jobs", "job"})


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _brand_label(host: str) -> str:
    """Return the host label that most plausibly names the company.

    *host* is expected lower-cased with any leading ``www.`` already removed.
    Leading generic hiring labels are dropped while at least two labels remain,
    so ``careers.acme.com`` yields ``acme`` while ``jobs.com`` stays ``jobs``.
    """
    labels = host.split(".")
    while len(labels) > 2 and labels[0] in _GENERIC_SUBDOMAIN_LABELS:
        labels = labels[1:]
    return labels[0]


def _is_plausible_careers_url(original_url: str, final_url: str) -> bool:
    """Return False for obvious false positives from redirect chains.

    Rejects:
    - SPA soft-404 paths: the final URL path contains one of
      ``_SOFT_404_INDICATORS`` (e.g. Netflix /careers → /NotFound?prev=...).
    - Off-brand cross-domain redirects: the final host does not contain the
      original site's brand label (e.g. microsoft.com/careers → bing.com).

    Accepts legitimate cross-domain redirects where the brand is preserved
    (e.g. amazon.com → amazon.jobs), including redirects from a probed hiring
    subdomain back to the main site (careers.acme.com → www.acme.com/careers).
    """
    orig = urlsplit(original_url)
    final = urlsplit(final_url)

    # 1. Reject soft-404 path indicators (case-insensitive).
    final_path_lower = final.path.lower()
    if any(indicator in final_path_lower for indicator in _SOFT_404_INDICATORS):
        return False

    # 2. If the host changed, verify the brand name survives in the new host.
    orig_host = re.sub(r"^www\.", "", orig.netloc, flags=re.IGNORECASE).lower()
    final_host = re.sub(r"^www\.", "", final.netloc, flags=re.IGNORECASE).lower()
    if orig_host != final_host:
        # Taking the first label blindly would turn "careers.acme.com" into the
        # brand "careers"; skip generic hiring labels first.
        brand = _brand_label(orig_host)
        # Brands of three characters or fewer are not checked at all.
        if len(brand) > 3 and brand not in final_host:
            return False

    return True


def _base_parts(website: str) -> tuple[str, str, str]:
    """Return (scheme, host, root_domain) for a website URL.

    The scheme defaults to ``https`` when the input has none; a scheme-less
    input such as ``acme.com/about`` takes its host from the first path
    segment. root_domain strips a leading 'www.' from the host so that
    subdomain candidates like 'careers.{root_domain}' are formed correctly.
    Example: 'https://www.acme.com/about' → ('https', 'www.acme.com', 'acme.com')
    """
    parts = urlsplit(website)
    scheme = parts.scheme or "https"
    host = parts.netloc or parts.path.split("/")[0]
    root = re.sub(r"^www\.", "", host, count=1, flags=re.IGNORECASE)
    return scheme, host, root


# ---------------------------------------------------------------------------
# Tier 2 — URL-pattern probing
# ---------------------------------------------------------------------------


def probe_url_patterns(website: str, client: httpx.Client) -> str | None:
    """Probe well-known career URL paths and subdomains; return first reachable URL.

    Probes in order:
      /careers, /career, /jobs, /join-us, /join
      careers.{root_domain}, jobs.{root_domain}
    Uses HTTP HEAD with GET fallback via http.probe_url. A candidate whose
    final (post-redirect) URL fails ``_is_plausible_careers_url`` is logged and
    skipped so the next candidate is tried. Returns the final URL of the first
    accepted candidate, or None when every candidate misses.
    """
    scheme, host, root = _base_parts(website)
    base = f"{scheme}://{host}"

    candidates: list[str] = [
        f"{base}/careers",
        f"{base}/career",
        f"{base}/jobs",
        f"{base}/join-us",
        f"{base}/join",
        f"{scheme}://careers.{root}",
        f"{scheme}://jobs.{root}",
    ]

    for candidate in candidates:
        result = probe_url(client, candidate)
        if result and _is_plausible_careers_url(candidate, result):
            logger.info("heuristics.probe_url_patterns(%s): hit url=%s", website, result)
            return result
        if result:
            logger.info(
                "heuristics.probe_url_patterns(%s): rejected redirect %s → %s",
                website, candidate, result,
            )

    logger.info("heuristics.probe_url_patterns(%s): no pattern matched", website)
    return None


# ---------------------------------------------------------------------------
# Tier 3 — Homepage link scan
# ---------------------------------------------------------------------------


def scan_homepage_links(
    website: str,
    client: httpx.Client,
    *,
    homepage_html: str | None = None,
) -> str | None:
    """Rank anchors on the homepage by career/job keywords; return best match.

    If *homepage_html* is provided it is used directly. Otherwise the homepage
    is fetched; an HTTP status >= 400 or any fetch exception is logged and
    yields None. Returns the absolute URL of the highest-scoring anchor when
    its score reaches ``_SCORE_THRESHOLD``, or None. On equal scores the
    earliest anchor in document order wins.
    """
    html = homepage_html
    if html is None:
        try:
            resp = request_with_retries(client, "GET", website, max_retries=1)
            if resp.status_code >= 400:
                logger.warning(
                    "heuristics.scan_homepage_links(%s): HTTP %s", website, resp.status_code
                )
                return None
            html = resp.text
        except Exception as exc:
            logger.warning(
                "heuristics.scan_homepage_links(%s): fetch error: %s", website, exc
            )
            return None

    if not html:
        return None

    soup = BeautifulSoup(html, "lxml")
    best_url: str | None = None
    best_score: float = 0.0

    for tag in soup.find_all("a", href=True):
        href: str = tag["href"].strip()
        # Skip non-HTTP links and fragment-only anchors.
        if not href or href.startswith(("mailto:", "tel:", "#", "javascript:")):
            continue

        full_url = urljoin(website, href)
        # Keep only http(s) links.
        if not full_url.startswith(("http://", "https://")):
            continue

        # The raw href (not the resolved URL) is scored together with the text.
        score = _score_anchor(href, tag.get_text(separator=" ", strip=True))
        if score > best_score:
            best_score = score
            best_url = full_url

    if best_url and best_score >= _SCORE_THRESHOLD:
        logger.info(
            "heuristics.scan_homepage_links(%s): hit url=%s score=%.1f",
            website, best_url, best_score,
        )
        return best_url

    logger.info(
        "heuristics.scan_homepage_links(%s): no link above threshold (best=%.1f)",
        website, best_score,
    )
    return None


def _score_anchor(href: str, text: str) -> float:
    """Compute a relevance score for an anchor based on its href path and text.

    Every pattern in ``_HREF_WEIGHTS`` that matches *href* and every pattern in
    ``_TEXT_WEIGHTS`` that matches *text* contributes its weight; the result is
    the sum (0.0 when nothing matches).
    """
    score = 0.0
    for pattern, weight in _HREF_WEIGHTS:
        if pattern.search(href):
            score += weight
    for pattern, weight in _TEXT_WEIGHTS:
        if pattern.search(text):
            score += weight
    return score


# ---------------------------------------------------------------------------
# Tier 4 — Sitemap
# ---------------------------------------------------------------------------

# NOTE(review): this is searched against the whole <loc> URL, so the "//host"
# part can satisfy it too (a host that starts with "job", "join", ... matches
# on every entry). Confirm whether host matches are intended before tightening.
_CAREER_URL_RE = re.compile(
    r"/(career|job|join|opening|position|opportunit|vacanc|hiring)",
    re.IGNORECASE,
)


def parse_sitemap(website: str, client: httpx.Client) -> str | None:
    """Fetch /sitemap.xml and scan <loc> URLs for career/job keywords.

    If the sitemap is an index, fetches up to _MAX_SITEMAP_CHILDREN child
    sitemaps. Scans up to _MAX_LOC_ENTRIES <loc> entries in total.
    Returns the first matching URL, or None (also when the sitemap is missing
    or cannot be fetched).
    """
    scheme, host, _ = _base_parts(website)
    sitemap_url = f"{scheme}://{host}/sitemap.xml"
    try:
        xml = _fetch_xml(client, sitemap_url)
    except Exception as exc:
        logger.info("heuristics.parse_sitemap(%s): fetch error: %s", website, exc)
        return None

    if xml is None:
        logger.info("heuristics.parse_sitemap(%s): sitemap not found", website)
        return None

    result = _scan_sitemap_xml(xml, website, client)
    if result:
        logger.info("heuristics.parse_sitemap(%s): hit url=%s", website, result)
    else:
        logger.info("heuristics.parse_sitemap(%s): no career URL found", website)
    return result


def _fetch_xml(client: httpx.Client, url: str) -> str | None:
    """GET a URL and return the text if the response is < 400, else None.

    Exceptions raised by ``request_with_retries`` propagate to the caller.
    """
    resp = request_with_retries(client, "GET", url, max_retries=1)
    if resp.status_code >= 400:
        return None
    return resp.text


def _scan_loc_tags(soup: BeautifulSoup, limit: int) -> tuple[str | None, int]:
    """Scan at most *limit* <loc> elements of a parsed sitemap.

    Returns ``(match, scanned)``: the first <loc> text matching
    ``_CAREER_URL_RE`` (or None) and the number of entries examined, so callers
    can charge the work against a shared budget.
    """
    scanned = 0
    for loc_tag in soup.find_all("loc"):
        if scanned >= limit:
            break
        scanned += 1
        loc: str = loc_tag.get_text(strip=True)
        if _CAREER_URL_RE.search(loc):
            return loc, scanned
    return None, scanned


def _scan_sitemap_xml(xml: str, website: str, client: httpx.Client) -> str | None:
    """Parse sitemap XML; handle sitemap index by fetching children.

    A plain sitemap (<urlset>) is scanned directly. For a sitemap index, up to
    _MAX_SITEMAP_CHILDREN children are fetched in document order and scanned
    until a match is found or the shared _MAX_LOC_ENTRIES budget is spent.
    *website* is currently unused and kept for interface stability.
    """
    soup = BeautifulSoup(xml, "xml")

    # Sitemap index: contains <sitemap><loc>...</loc></sitemap>
    sitemap_tags = soup.find_all("sitemap")
    if not sitemap_tags:
        # Plain sitemap: contains <url><loc>...</loc></url>. Reuse the parsed
        # tree instead of parsing the document a second time.
        match, _ = _scan_loc_tags(soup, _MAX_LOC_ENTRIES)
        return match

    child_urls: list[str] = []
    for tag in sitemap_tags:
        loc_tag = tag.find("loc")
        if loc_tag is not None:
            child_urls.append(loc_tag.get_text(strip=True))

    remaining = _MAX_LOC_ENTRIES  # one budget shared by all child sitemaps
    for child_url in child_urls[:_MAX_SITEMAP_CHILDREN]:
        if remaining <= 0:
            break
        try:
            child_xml = _fetch_xml(client, child_url)
        except Exception as exc:
            # Best effort: one unreachable child must not abort the whole tier.
            logger.info(
                "heuristics.parse_sitemap: child sitemap %s fetch error: %s",
                child_url, exc,
            )
            continue
        if not child_xml:
            continue
        match, scanned = _scan_loc_tags(BeautifulSoup(child_xml, "xml"), remaining)
        if match:
            return match
        remaining -= scanned
    return None


def _scan_locs(xml: str) -> str | None:
    """Scan <loc> elements in a sitemap for career/job keywords.

    Parses *xml* and returns the first <loc> text matching ``_CAREER_URL_RE``
    among the first _MAX_LOC_ENTRIES entries, or None.
    """
    match, _ = _scan_loc_tags(BeautifulSoup(xml, "xml"), _MAX_LOC_ENTRIES)
    return match
a/jobsource/http.py +++ b/jobsource/http.py @@ -18,6 +18,7 @@ from .config import get_settings logger = logging.getLogger(__name__) _RETRY_STATUS = frozenset({429, 500, 502, 503, 504}) +_HEAD_NOT_SUPPORTED = frozenset({405, 501}) def default_headers() -> dict[str, str]: @@ -95,3 +96,21 @@ def request_with_retries( if last_exc is not None: # pragma: no cover - defensive raise last_exc raise RuntimeError("request_with_retries exhausted without a response") + + +def probe_url(client: httpx.Client, url: str) -> str | None: + """Probe a URL with HEAD (fallback GET on 405/501); return final URL or None. + + Returns the str representation of the final URL after redirects when the + server responds with a non-error status (<400). Returns None on any + network error or error status. + """ + try: + resp = request_with_retries(client, "HEAD", url, max_retries=1) + if resp.status_code in _HEAD_NOT_SUPPORTED: + resp = request_with_retries(client, "GET", url, max_retries=1) + if resp.status_code < 400: + return str(resp.url) + return None + except Exception: + return None diff --git a/tests/test_ats.py b/tests/test_ats.py new file mode 100644 index 0000000..42b7306 --- /dev/null +++ b/tests/test_ats.py @@ -0,0 +1,749 @@ +"""Tests for jobsource/careers/ats.py — all network-free via monkeypatching.""" +from __future__ import annotations + +import pytest + +from jobsource.careers.ats import ( + ATSBoard, + ATSFetch, + ATSResult, + _board_from_slug, + _domain_stem, + _fetch_ashby, + _fetch_greenhouse, + _fetch_lever, + _fetch_workday, + _loose_name_match, + _slug_candidates, + detect_and_fetch, + detect_ats_in_html, + detect_ats_in_url, + recover_via_slug_guess, +) + + +# --------------------------------------------------------------------------- +# Tiny fake HTTP response for monkeypatching request_with_retries +# --------------------------------------------------------------------------- + + +class FakeResponse: + def __init__(self, status_code: int, body: object, url: str = 
"https://example.com"): + self.status_code = status_code + self._body = body + self.url = url + self.text = str(body) + + def json(self) -> object: + return self._body + + +class FakeClient: + """Stands in for httpx.Client; never actually used in network calls here.""" + + +# --------------------------------------------------------------------------- +# detect_ats_in_html — Greenhouse +# --------------------------------------------------------------------------- + + +class TestDetectATSInHtmlGreenhouse: + def test_boards_greenhouse_script_tag(self): + html = '' + board = detect_ats_in_html(html) + assert board is not None + assert board.ats_name == "greenhouse" + assert board.slug == "airbnb" + assert board.careers_url == "https://boards.greenhouse.io/airbnb" + + def test_boards_greenhouse_direct_link(self): + html = 'Jobs' + board = detect_ats_in_html(html) + assert board is not None + assert board.ats_name == "greenhouse" + assert board.slug == "acme" + + def test_job_boards_subdomain(self): + html = 'Jobs' + board = detect_ats_in_html(html) + assert board is not None + assert board.ats_name == "greenhouse" + assert board.slug == "stripe" + + def test_no_match_returns_none(self): + assert detect_ats_in_html("Nothing here") is None + + +# --------------------------------------------------------------------------- +# detect_ats_in_html — Lever +# --------------------------------------------------------------------------- + + +class TestDetectATSInHtmlLever: + def test_jobs_lever_link(self): + html = 'Open roles' + board = detect_ats_in_html(html) + assert board is not None + assert board.ats_name == "lever" + assert board.slug == "leverdemo" + assert board.careers_url == "https://jobs.lever.co/leverdemo" + + def test_lever_embed_script(self): + html = 'var lever = "jobs.lever.co/acme-corp";' + board = detect_ats_in_html(html) + assert board is not None + assert board.ats_name == "lever" + assert board.slug == "acme-corp" + + +# 
--------------------------------------------------------------------------- +# detect_ats_in_html — Ashby +# --------------------------------------------------------------------------- + + +class TestDetectATSInHtmlAshby: + def test_jobs_ashbyhq_link(self): + html = 'Careers' + board = detect_ats_in_html(html) + assert board is not None + assert board.ats_name == "ashby" + assert board.slug == "Ramp" + assert board.careers_url == "https://jobs.ashbyhq.com/Ramp" + + def test_lowercase_slug(self): + html = 'Join us' + board = detect_ats_in_html(html) + assert board is not None + assert board.slug == "linear" + + +# --------------------------------------------------------------------------- +# detect_ats_in_html — Workday +# --------------------------------------------------------------------------- + + +class TestDetectATSInHtmlWorkday: + def test_myworkdayjobs_link(self): + html = 'Jobs' + board = detect_ats_in_html(html) + assert board is not None + assert board.ats_name == "workday" + assert board.wd_host == "nvidia.wd5.myworkdayjobs.com" + assert board.wd_tenant == "nvidia" + assert board.wd_site == "NVIDIAExternalCareerSite" + assert "en-US" in board.careers_url + assert board.careers_url == "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite" + + def test_workday_without_locale(self): + html = 'Careers' + board = detect_ats_in_html(html) + assert board is not None + assert board.ats_name == "workday" + assert board.wd_site == "AcmeCareers" + + def test_workday_missing_site_returns_none(self): + # Just the host with no path — can't form a board + html = 'https://acme.wd1.myworkdayjobs.com' + board = detect_ats_in_html(html) + assert board is None + + +# --------------------------------------------------------------------------- +# detect_ats_in_url +# --------------------------------------------------------------------------- + + +class TestDetectATSInUrl: + def test_greenhouse_url(self): + board = 
detect_ats_in_url("https://boards.greenhouse.io/stripe") + assert board is not None + assert board.ats_name == "greenhouse" + assert board.slug == "stripe" + + def test_lever_url(self): + board = detect_ats_in_url("https://jobs.lever.co/leverdemo") + assert board is not None + assert board.ats_name == "lever" + + def test_ashby_url(self): + board = detect_ats_in_url("https://jobs.ashbyhq.com/linear") + assert board is not None + assert board.ats_name == "ashby" + + def test_workday_url(self): + board = detect_ats_in_url( + "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite" + ) + assert board is not None + assert board.ats_name == "workday" + + def test_non_ats_url_returns_none(self): + assert detect_ats_in_url("https://www.acme.com/careers") is None + + +# --------------------------------------------------------------------------- +# _fetch_greenhouse +# --------------------------------------------------------------------------- + + +class TestFetchGreenhouse: + def _board(self, slug: str = "airbnb") -> ATSBoard: + return ATSBoard( + ats_name="greenhouse", + slug=slug, + careers_url=f"https://boards.greenhouse.io/{slug}", + ) + + def test_extracts_absolute_url(self, monkeypatch): + fake_body = { + "jobs": [{"absolute_url": "https://careers.airbnb.com/positions/123", + "company_name": "Airbnb"}], + "meta": {"total": 42}, + } + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda client, method, url, **kw: FakeResponse(200, fake_body), + ) + fetch = _fetch_greenhouse(self._board(), FakeClient()) + assert fetch.first_url == "https://careers.airbnb.com/positions/123" + assert fetch.job_count == 42 + assert fetch.org_name == "Airbnb" + + def test_empty_jobs_list(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(200, {"jobs": [], "meta": {"total": 0}}), + ) + fetch = _fetch_greenhouse(self._board(), FakeClient()) + assert fetch.first_url is None + assert 
fetch.job_count == 0 + assert fetch.org_name is None + + def test_non_200_returns_none(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(404, {}), + ) + fetch = _fetch_greenhouse(self._board(), FakeClient()) + assert fetch.first_url is None + assert fetch.job_count == 0 + + def test_network_error_returns_none(self, monkeypatch): + def boom(*a, **kw): + raise RuntimeError("network down") + monkeypatch.setattr("jobsource.careers.ats.request_with_retries", boom) + fetch = _fetch_greenhouse(self._board(), FakeClient()) + assert fetch.first_url is None + assert fetch.job_count == 0 + + +# --------------------------------------------------------------------------- +# _fetch_lever +# --------------------------------------------------------------------------- + + +class TestFetchLever: + def _board(self, slug: str = "leverdemo") -> ATSBoard: + return ATSBoard( + ats_name="lever", + slug=slug, + careers_url=f"https://jobs.lever.co/{slug}", + ) + + def test_extracts_hosted_url(self, monkeypatch): + fake_body = [{"hostedUrl": "https://jobs.lever.co/leverdemo/abc-123"}] + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(200, fake_body), + ) + fetch = _fetch_lever(self._board(), FakeClient()) + assert fetch.first_url == "https://jobs.lever.co/leverdemo/abc-123" + assert fetch.job_count == 1 + + def test_empty_list(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(200, []), + ) + fetch = _fetch_lever(self._board(), FakeClient()) + assert fetch.first_url is None + assert fetch.job_count == 0 + + def test_non_list_response(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(200, {"error": "not found"}), + ) + fetch = _fetch_lever(self._board(), FakeClient()) + assert fetch.first_url is None + assert fetch.job_count 
== 0 + + def test_non_200_returns_none(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(404, []), + ) + fetch = _fetch_lever(self._board(), FakeClient()) + assert fetch.first_url is None + + +# --------------------------------------------------------------------------- +# _fetch_ashby +# --------------------------------------------------------------------------- + + +class TestFetchAshby: + def _board(self, slug: str = "Ramp") -> ATSBoard: + return ATSBoard( + ats_name="ashby", + slug=slug, + careers_url=f"https://jobs.ashbyhq.com/{slug}", + ) + + def test_extracts_job_url(self, monkeypatch): + fake_body = { + "jobs": [{"jobUrl": "https://jobs.ashbyhq.com/Ramp/abc-def"}], + "apiVersion": "1", + } + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(200, fake_body), + ) + fetch = _fetch_ashby(self._board(), FakeClient()) + assert fetch.first_url == "https://jobs.ashbyhq.com/Ramp/abc-def" + assert fetch.job_count == 1 + + def test_empty_jobs(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(200, {"jobs": []}), + ) + fetch = _fetch_ashby(self._board(), FakeClient()) + assert fetch.first_url is None + assert fetch.job_count == 0 + + def test_network_error_returns_none(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: (_ for _ in ()).throw(RuntimeError("timeout")), + ) + fetch = _fetch_ashby(self._board(), FakeClient()) + assert fetch.first_url is None + + +# --------------------------------------------------------------------------- +# _fetch_workday +# --------------------------------------------------------------------------- + + +class TestFetchWorkday: + def _board(self) -> ATSBoard: + return ATSBoard( + ats_name="workday", + slug="nvidia/NVIDIAExternalCareerSite", + 
careers_url="https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite", + wd_host="nvidia.wd5.myworkdayjobs.com", + wd_tenant="nvidia", + wd_site="NVIDIAExternalCareerSite", + ) + + def test_builds_full_job_url(self, monkeypatch): + fake_body = { + "total": 2000, + "jobPostings": [{"externalPath": "/job/US/SWE_JR123"}], + } + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(200, fake_body), + ) + fetch = _fetch_workday(self._board(), FakeClient()) + assert fetch.first_url == "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite/job/US/SWE_JR123" + assert fetch.job_count == 2000 + + def test_empty_postings(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(200, {"total": 0, "jobPostings": []}), + ) + fetch = _fetch_workday(self._board(), FakeClient()) + assert fetch.first_url is None + assert fetch.job_count == 0 + + def test_missing_wd_coords_returns_none(self): + board = ATSBoard( + ats_name="workday", slug="x", careers_url="https://x.wd1.myworkdayjobs.com" + ) + fetch = _fetch_workday(board, FakeClient()) + assert fetch.first_url is None + assert fetch.job_count == 0 + + def test_non_200_returns_none(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(403, {}), + ) + fetch = _fetch_workday(self._board(), FakeClient()) + assert fetch.first_url is None + + +# --------------------------------------------------------------------------- +# detect_and_fetch orchestration +# --------------------------------------------------------------------------- + + +class TestDetectAndFetch: + def test_greenhouse_full_flow(self, monkeypatch): + html = 'Jobs' + job_resp = { + "jobs": [{"absolute_url": "https://careers.airbnb.com/positions/1"}], + "meta": {"total": 5}, + } + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: 
FakeResponse(200, job_resp), + ) + result = detect_and_fetch("https://www.airbnb.com", FakeClient(), homepage_html=html) + assert result is not None + assert result.ats_name == "greenhouse" + assert result.careers_url == "https://boards.greenhouse.io/airbnb" + assert result.position_url == "https://careers.airbnb.com/positions/1" + assert result.job_count == 5 + + def test_no_ats_returns_none(self, monkeypatch): + html = "No ATS here" + result = detect_and_fetch("https://www.example.com", FakeClient(), homepage_html=html) + assert result is None + + def test_api_failure_returns_result_without_position_url(self, monkeypatch): + html = 'Jobs' + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(500, []), + ) + result = detect_and_fetch("https://www.acme.com", FakeClient(), homepage_html=html) + assert result is not None + assert result.ats_name == "lever" + assert result.careers_url == "https://jobs.lever.co/acme" + assert result.position_url is None + + def test_homepage_fetch_failure_returns_none(self, monkeypatch): + """When homepage_html is None and the fetch fails, return None.""" + def boom(*a, **kw): + raise RuntimeError("connection refused") + monkeypatch.setattr("jobsource.careers.ats.request_with_retries", boom) + result = detect_and_fetch("https://www.example.com", FakeClient()) + assert result is None + + def test_uses_provided_html_without_fetching(self, monkeypatch): + """If homepage_html is provided, request_with_retries is only called for the API.""" + html = 'Jobs' + calls: list[str] = [] + job_resp = {"jobs": [{"jobUrl": "https://jobs.ashbyhq.com/linear/xyz"}]} + + def fake_req(client, method, url, **kw): + calls.append(url) + return FakeResponse(200, job_resp) + + monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) + result = detect_and_fetch("https://www.linear.app", FakeClient(), homepage_html=html) + assert result is not None + # Only one call: the API fetch (not the 
homepage) + assert len(calls) == 1 + assert "ashby" in calls[0] + + +# --------------------------------------------------------------------------- +# Pure-unit helpers +# --------------------------------------------------------------------------- + + +class TestDomainStem: + def test_strips_www(self): + assert _domain_stem("https://www.anthropic.com") == "anthropic" + + def test_no_www(self): + assert _domain_stem("https://linear.app") == "linear" + + def test_with_path(self): + assert _domain_stem("https://www.figma.com/careers/") == "figma" + + def test_invalid_returns_none(self): + assert _domain_stem("") is None or isinstance(_domain_stem(""), (str, type(None))) + + +class TestSlugCandidates: + def test_domain_stem_first(self): + candidates = _slug_candidates("https://www.anthropic.com", "Anthropic") + assert candidates[0] == "anthropic" + + def test_deduplicates_stem_and_name(self): + # stem == normalized name → only one entry + candidates = _slug_candidates("https://www.anthropic.com", "Anthropic") + assert candidates.count("anthropic") == 1 + + def test_different_stem_and_name(self): + # stem differs from normalized name → both appear + candidates = _slug_candidates("https://www.acmecorp.com", "Acme Corp Inc") + assert "acmecorp" in candidates + assert "acmecorp" in candidates or "acmecorp" in candidates + # normalized name strips "Inc" → "acmecorp" + + def test_name_only_candidate_when_stem_equal(self): + # When stem and slug match, only one entry + candidates = _slug_candidates("https://ramp.com", "Ramp") + assert len(candidates) == 1 + assert candidates[0] == "ramp" + + def test_no_company_name_uses_stem_only(self): + candidates = _slug_candidates("https://www.anthropic.com", None) + assert candidates == ["anthropic"] + + def test_max_three_candidates(self): + # Can't produce more than 3 + candidates = _slug_candidates("https://www.x.com", "X Corp Inc") + assert len(candidates) <= 3 + + +class TestLooseNameMatch: + def test_exact_match(self): + assert 
_loose_name_match("Anthropic", "Anthropic") is True + + def test_one_substring_of_other(self): + assert _loose_name_match("Acme", "Acme Corp Inc") is True + + def test_clear_mismatch(self): + assert _loose_name_match("Acme", "Globex") is False + + def test_empty_input_returns_true(self): + assert _loose_name_match("", "Acme") is True + + def test_empty_org_returns_true(self): + assert _loose_name_match("Acme", "") is True + + def test_case_insensitive(self): + assert _loose_name_match("ANTHROPIC", "anthropic") is True + + +# --------------------------------------------------------------------------- +# recover_via_slug_guess +# --------------------------------------------------------------------------- + + +class TestRecoverViaSlugGuess: + """All tests drive the real _fetch_* via a URL-dispatching fake request_with_retries.""" + + def _gh_resp(self, slug: str, company_name: str, count: int = 5) -> dict: + """Canned Greenhouse response with jobs.""" + return { + "jobs": [{"absolute_url": f"https://boards.greenhouse.io/{slug}/jobs/1", + "company_name": company_name}], + "meta": {"total": count}, + } + + def _gh_empty(self) -> dict: + return {"jobs": [], "meta": {"total": 0}} + + def _lever_resp(self, slug: str, count: int = 3) -> list: + return [{"hostedUrl": f"https://jobs.lever.co/{slug}/abc"}] * count + + def _ashby_resp(self, slug: str, count: int = 2) -> dict: + return {"jobs": [{"jobUrl": f"https://jobs.ashbyhq.com/{slug}/xyz"}] * count} + + def _ashby_empty(self) -> dict: + return {"jobs": []} + + def _lever_empty(self) -> list: + return [] + + # ----- Domain-stem hit (Anthropic-style) ----- + + def test_domain_stem_greenhouse_hit(self, monkeypatch): + """Greenhouse slug derived from domain stem → returns ATSResult.""" + def fake_req(client, method, url, **kw): + if "boards-api.greenhouse.io/v1/boards/anthropic" in url: + return FakeResponse(200, self._gh_resp("anthropic", "Anthropic", count=370)) + # All other probes empty + if "lever.co" in url: + return 
FakeResponse(200, self._lever_empty()) + if "ashby" in url: + return FakeResponse(200, self._ashby_empty()) + return FakeResponse(404, {}) + + monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) + result = recover_via_slug_guess( + "https://www.anthropic.com", "Anthropic", FakeClient() + ) + assert result is not None + assert result.ats_name == "greenhouse" + assert result.careers_url == "https://boards.greenhouse.io/anthropic" + assert result.position_url is not None + assert result.job_count == 370 + + # ----- Name-candidate fallback when stem misses ----- + + def test_name_candidate_fallback(self, monkeypatch): + """Stem misses; _slug(company_name) slug hits on Lever.""" + def fake_req(client, method, url, **kw): + # Stem slug "acmecorp" → greenhouse empty, lever empty, ashby empty + if "acmecorp" in url and "boards-api" in url: + return FakeResponse(200, self._gh_empty()) + if "acmecorp" in url and "lever" in url: + return FakeResponse(200, self._lever_empty()) + if "acmecorp" in url and "ashby" in url: + return FakeResponse(200, self._ashby_empty()) + # Name slug "acme" → lever hit + if "acme" in url and "lever" in url: + return FakeResponse(200, self._lever_resp("acme")) + if "acme" in url and "boards-api" in url: + return FakeResponse(200, self._gh_empty()) + return FakeResponse(404, {}) + + monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) + result = recover_via_slug_guess( + "https://www.acmecorp.com", "Acme", FakeClient() + ) + assert result is not None + assert result.ats_name == "lever" + assert "acme" in result.careers_url + + # ----- 0-jobs reject ----- + + def test_zero_jobs_rejected(self, monkeypatch): + """All slugs resolve but job_count==0 everywhere → None.""" + def fake_req(client, method, url, **kw): + if "boards-api" in url: + return FakeResponse(200, self._gh_empty()) + if "lever" in url: + return FakeResponse(200, self._lever_empty()) + if "ashby" in url: + return FakeResponse(200, 
self._ashby_empty()) + return FakeResponse(404, {}) + + monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) + result = recover_via_slug_guess( + "https://www.acme.com", "Acme", FakeClient() + ) + assert result is None + + # ----- Org-name mismatch reject (collision guard) ----- + + def test_org_name_mismatch_rejected(self, monkeypatch): + """Greenhouse returns jobs but org_name is a different company → skip to lever/ashby → miss.""" + def fake_req(client, method, url, **kw): + if "boards-api" in url: + # Returns jobs but for wrong company + return FakeResponse(200, self._gh_resp("acme", "Globex Corporation", count=10)) + if "lever" in url: + return FakeResponse(200, self._lever_empty()) + if "ashby" in url: + return FakeResponse(200, self._ashby_empty()) + return FakeResponse(404, {}) + + monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) + result = recover_via_slug_guess( + "https://www.acme.com", "Acme Corp", FakeClient() + ) + assert result is None + + # ----- Lever hit (no org_name) accepted on job_count alone ----- + + def test_lever_hit_without_org_name_accepted(self, monkeypatch): + """Lever doesn't expose org_name → cross-check is skipped; job_count>0 wins.""" + def fake_req(client, method, url, **kw): + if "boards-api" in url: + return FakeResponse(200, self._gh_empty()) + if "lever" in url: + return FakeResponse(200, self._lever_resp("acme")) + if "ashby" in url: + return FakeResponse(200, self._ashby_empty()) + return FakeResponse(404, {}) + + monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) + result = recover_via_slug_guess( + "https://www.acme.com", "Completely Different Name", FakeClient() + ) + # Lever returns jobs, no org_name → cross-check skipped → accepted + assert result is not None + assert result.ats_name == "lever" + + # ----- Short-circuit: greenhouse hit stops remaining probes ----- + + def test_short_circuits_on_first_hit(self, monkeypatch): + """Once Greenhouse 
hits, Lever and Ashby are NOT probed.""" + probed: list[str] = [] + + def fake_req(client, method, url, **kw): + probed.append(url) + if "boards-api.greenhouse.io/v1/boards/acme" in url: + return FakeResponse(200, self._gh_resp("acme", "Acme", count=5)) + if "lever" in url: + return FakeResponse(200, self._lever_resp("acme")) + if "ashby" in url: + return FakeResponse(200, self._ashby_resp("acme")) + return FakeResponse(404, {}) + + monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) + result = recover_via_slug_guess( + "https://www.acme.com", "Acme", FakeClient() + ) + assert result is not None + assert result.ats_name == "greenhouse" + # Lever and Ashby URLs must not have been probed + assert not any("lever" in u for u in probed) + assert not any("ashby" in u for u in probed) + + # ----- company_name=None: stem-only, cross-check skipped ----- + + def test_no_company_name_uses_stem_and_skips_crosscheck(self, monkeypatch): + """With company_name=None, use domain stem only; org_name cross-check skipped.""" + def fake_req(client, method, url, **kw): + if "boards-api.greenhouse.io/v1/boards/acme" in url: + return FakeResponse(200, self._gh_resp("acme", "Some Other Company")) + if "lever" in url: + return FakeResponse(200, self._lever_empty()) + if "ashby" in url: + return FakeResponse(200, self._ashby_empty()) + return FakeResponse(404, {}) + + monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) + # company_name=None → cross-check disabled even if org_name differs + result = recover_via_slug_guess( + "https://www.acme.com", None, FakeClient() + ) + assert result is not None + assert result.ats_name == "greenhouse" + + # ----- All-miss ----- + + def test_all_miss_returns_none(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.ats.request_with_retries", + lambda *a, **kw: FakeResponse(404, {}), + ) + result = recover_via_slug_guess( + "https://www.nobody.com", "Nobody Inc", FakeClient() + ) + assert result is 
None + + # ----- Network error on one probe falls through ----- + + def test_single_probe_error_falls_through(self, monkeypatch): + """A probe that raises should not abort recovery; others are still tried.""" + call_count = [0] + + def fake_req(client, method, url, **kw): + call_count[0] += 1 + if "boards-api" in url: + raise RuntimeError("greenhouse down") + if "lever" in url: + return FakeResponse(200, self._lever_resp("acme")) + if "ashby" in url: + return FakeResponse(200, self._ashby_empty()) + return FakeResponse(404, {}) + + monkeypatch.setattr("jobsource.careers.ats.request_with_retries", fake_req) + result = recover_via_slug_guess( + "https://www.acme.com", None, FakeClient() + ) + assert result is not None + assert result.ats_name == "lever" diff --git a/tests/test_cascade.py b/tests/test_cascade.py new file mode 100644 index 0000000..bc3ef82 --- /dev/null +++ b/tests/test_cascade.py @@ -0,0 +1,552 @@ +"""Tests for jobsource/careers/cascade.py — all network-free via monkeypatching.""" +from __future__ import annotations + +import pytest + +from jobsource.careers import CareersResult, find_careers_page +from jobsource.careers.cascade import _detect_ats_in_page, _finalize, _safe_get_html +from jobsource.careers.ats import ATSBoard, ATSFetch, ATSResult + + +# --------------------------------------------------------------------------- +# Fake helpers +# --------------------------------------------------------------------------- + + +class FakeResponse: + def __init__(self, status_code: int, text: str = "", url: str = "https://example.com"): + self.status_code = status_code + self.text = text + self.url = url + + +class FakeClient: + pass + + +# --------------------------------------------------------------------------- +# _detect_ats_in_page +# --------------------------------------------------------------------------- + + +class TestDetectATSInPage: + def test_returns_board_from_page_html(self, monkeypatch): + html = 'Jobs' + monkeypatch.setattr( + 
"jobsource.careers.cascade.request_with_retries", + lambda *a, **kw: FakeResponse(200, html), + ) + board = _detect_ats_in_page("https://vercel.com/careers", FakeClient()) + assert board is not None + assert board.ats_name == "greenhouse" + assert board.slug == "vercel" + + def test_returns_none_on_404(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.cascade.request_with_retries", + lambda *a, **kw: FakeResponse(404, ""), + ) + assert _detect_ats_in_page("https://vercel.com/careers", FakeClient()) is None + + def test_returns_none_on_exception(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.cascade.request_with_retries", + lambda *a, **kw: (_ for _ in ()).throw(RuntimeError("timeout")), + ) + assert _detect_ats_in_page("https://vercel.com/careers", FakeClient()) is None + + def test_returns_none_when_no_ats_in_page(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.cascade.request_with_retries", + lambda *a, **kw: FakeResponse(200, "no ats here"), + ) + assert _detect_ats_in_page("https://acme.com/careers", FakeClient()) is None + + +# --------------------------------------------------------------------------- +# _safe_get_html +# --------------------------------------------------------------------------- + + +class TestSafeGetHtml: + def test_returns_text_on_200(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.cascade.request_with_retries", + lambda *a, **kw: FakeResponse(200, "page html"), + ) + result = _safe_get_html("https://acme.com", FakeClient()) + assert result == "page html" + + def test_returns_none_on_404(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.cascade.request_with_retries", + lambda *a, **kw: FakeResponse(404, ""), + ) + result = _safe_get_html("https://acme.com", FakeClient()) + assert result is None + + def test_returns_none_on_exception(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.cascade.request_with_retries", + lambda *a, **kw: (_ for _ in 
()).throw(RuntimeError("network error")), + ) + result = _safe_get_html("https://acme.com", FakeClient()) + assert result is None + + +# --------------------------------------------------------------------------- +# find_careers_page — tier ordering and early return +# --------------------------------------------------------------------------- + + +class TestCascadeTierOrdering: + def _patch_tiers(self, monkeypatch, *, ats=None, slug_guess=None, + url_pattern=None, homepage=None, sitemap=None, html=""): + monkeypatch.setattr( + "jobsource.careers.cascade._safe_get_html", + lambda website, client: html, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_and_fetch", + lambda *a, **kw: ats, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.recover_via_slug_guess", + lambda *a, **kw: slug_guess, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.probe_url_patterns", + lambda *a, **kw: url_pattern, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.scan_homepage_links", + lambda *a, **kw: homepage, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.parse_sitemap", + lambda *a, **kw: sitemap, + ) + # Also stub detect_ats_in_url so _finalize doesn't try to do network calls + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_ats_in_url", + lambda url: None, + ) + + def test_ats_hit_returns_095_confidence(self, monkeypatch): + ats_result = ATSResult( + ats_name="greenhouse", + careers_url="https://boards.greenhouse.io/acme", + position_url="https://careers.acme.com/positions/1", + job_count=10, + ) + self._patch_tiers(monkeypatch, ats=ats_result) + result = find_careers_page("https://www.acme.com", client=FakeClient()) + assert result.careers_url == "https://boards.greenhouse.io/acme" + assert result.confidence == 0.95 + assert result.method == "ats:greenhouse" + assert result.ats_name == "greenhouse" + assert result.position_url == "https://careers.acme.com/positions/1" + + def 
test_url_pattern_hit_when_ats_misses(self, monkeypatch): + self._patch_tiers(monkeypatch, url_pattern="https://acme.com/careers") + result = find_careers_page("https://www.acme.com", client=FakeClient()) + assert result.careers_url == "https://acme.com/careers" + assert result.confidence == 0.80 + assert result.method == "url_pattern" + assert result.ats_name is None + + def test_homepage_scan_hit_when_ats_and_url_pattern_miss(self, monkeypatch): + self._patch_tiers(monkeypatch, homepage="https://acme.com/careers") + result = find_careers_page("https://www.acme.com", client=FakeClient()) + assert result.careers_url == "https://acme.com/careers" + assert result.confidence == 0.60 + assert result.method == "homepage_scan" + + def test_sitemap_hit_when_all_else_misses(self, monkeypatch): + self._patch_tiers(monkeypatch, sitemap="https://acme.com/careers") + result = find_careers_page("https://www.acme.com", client=FakeClient()) + assert result.careers_url == "https://acme.com/careers" + assert result.confidence == 0.50 + assert result.method == "sitemap" + + def test_all_miss_returns_none_method(self, monkeypatch): + self._patch_tiers(monkeypatch) + result = find_careers_page("https://www.acme.com", client=FakeClient()) + assert result.careers_url is None + assert result.confidence == 0.0 + assert result.method == "none" + + def test_ats_hit_skips_later_tiers(self, monkeypatch): + """When ATS resolves, slug_guess/url_pattern/homepage/sitemap should not be called.""" + ats_result = ATSResult( + ats_name="lever", careers_url="https://jobs.lever.co/acme", + ) + later_called: list[str] = [] + + monkeypatch.setattr( + "jobsource.careers.cascade._safe_get_html", + lambda *a, **kw: "", + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_and_fetch", + lambda *a, **kw: ats_result, + ) + + def make_tracker(name): + def fn(*a, **kw): + later_called.append(name) + return None + return fn + + monkeypatch.setattr( + 
"jobsource.careers.cascade._ats.recover_via_slug_guess", + make_tracker("slug_guess"), + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.probe_url_patterns", + make_tracker("url_pattern"), + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.scan_homepage_links", + make_tracker("homepage_scan"), + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.parse_sitemap", + make_tracker("sitemap"), + ) + + find_careers_page("https://www.acme.com", client=FakeClient()) + assert later_called == [] + + def test_failing_tier_falls_through(self, monkeypatch): + """A tier that raises should not abort the cascade.""" + monkeypatch.setattr( + "jobsource.careers.cascade._safe_get_html", + lambda *a, **kw: "", + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_and_fetch", + lambda *a, **kw: (_ for _ in ()).throw(RuntimeError("ats exploded")), + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.recover_via_slug_guess", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.probe_url_patterns", + lambda *a, **kw: "https://acme.com/careers", + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_ats_in_url", + lambda url: None, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.scan_homepage_links", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.parse_sitemap", + lambda *a, **kw: None, + ) + + result = find_careers_page("https://www.acme.com", client=FakeClient()) + assert result.method == "url_pattern" + assert result.careers_url == "https://acme.com/careers" + + +# --------------------------------------------------------------------------- +# ATS-URL upgrade in _finalize +# --------------------------------------------------------------------------- + + +class TestFinalizeATSUpgrade: + def test_lever_url_upgrades_to_ats_lever(self, monkeypatch): + """When a heuristic finds a Lever URL, _finalize 
upgrades to ats:lever.""" + from jobsource.careers.ats import ATSBoard + + lever_board = ATSBoard( + ats_name="lever", + slug="acme", + careers_url="https://jobs.lever.co/acme", + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_ats_in_url", + lambda url: lever_board, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats._FETCH_DISPATCH", + {"lever": lambda board, client: ATSFetch(first_url="https://jobs.lever.co/acme/abc", job_count=5)}, + ) + + result = _finalize( + "https://jobs.lever.co/acme", "homepage_scan", 0.60, + "https://www.acme.com", FakeClient(), + ) + assert result.method == "ats:lever" + assert result.confidence == 0.95 + assert result.position_url == "https://jobs.lever.co/acme/abc" + assert result.ats_name == "lever" + + def test_non_ats_url_no_upgrade(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_ats_in_url", + lambda url: None, + ) + result = _finalize( + "https://acme.com/careers", "url_pattern", 0.80, + "https://www.acme.com", FakeClient(), + ) + assert result.method == "url_pattern" + assert result.confidence == 0.80 + assert result.position_url is None + + def test_ats_upgrade_from_page_html(self, monkeypatch): + """URL pattern finds /careers; fetching that page reveals Greenhouse embed → upgrade.""" + gh_board = ATSBoard( + ats_name="greenhouse", + slug="vercel", + careers_url="https://boards.greenhouse.io/vercel", + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_ats_in_url", + lambda url: None, # URL string itself is not an ATS URL + ) + monkeypatch.setattr( + "jobsource.careers.cascade._detect_ats_in_page", + lambda url, client: gh_board, # page HTML reveals Greenhouse + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats._FETCH_DISPATCH", + {"greenhouse": lambda board, client: ATSFetch(first_url="https://job-boards.greenhouse.io/vercel/jobs/123", job_count=73)}, + ) + result = _finalize( + "https://vercel.com/careers", "url_pattern", 0.80, + 
"https://vercel.com", FakeClient(), + ) + assert result.method == "ats:greenhouse" + assert result.confidence == 0.95 + assert result.careers_url == "https://boards.greenhouse.io/vercel" + assert result.position_url == "https://job-boards.greenhouse.io/vercel/jobs/123" + assert result.ats_name == "greenhouse" + + def test_ats_upgrade_fetch_failure_falls_back_to_original(self, monkeypatch): + """If ATS fetch during upgrade fails, return the original heuristic result.""" + from jobsource.careers.ats import ATSBoard + + gh_board = ATSBoard( + ats_name="greenhouse", + slug="acme", + careers_url="https://boards.greenhouse.io/acme", + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_ats_in_url", + lambda url: gh_board, + ) + + def boom(board, client): + raise RuntimeError("api down") + + monkeypatch.setattr( + "jobsource.careers.cascade._ats._FETCH_DISPATCH", + {"greenhouse": boom}, + ) + result = _finalize( + "https://boards.greenhouse.io/acme", "sitemap", 0.50, + "https://www.acme.com", FakeClient(), + ) + # Upgrade failed — returns the original heuristic result + assert result.method == "sitemap" + assert result.confidence == 0.50 + + +# --------------------------------------------------------------------------- +# CareersResult model +# --------------------------------------------------------------------------- + + +class TestCareersResultModel: + def test_defaults(self): + r = CareersResult() + assert r.careers_url is None + assert r.confidence == 0.0 + assert r.method == "none" + assert r.ats_name is None + assert r.position_url is None + + def test_full(self): + r = CareersResult( + careers_url="https://boards.greenhouse.io/acme", + confidence=0.95, + method="ats:greenhouse", + ats_name="greenhouse", + position_url="https://careers.acme.com/positions/1", + ) + assert r.careers_url == "https://boards.greenhouse.io/acme" + assert r.ats_name == "greenhouse" + + +# --------------------------------------------------------------------------- +# Tier 1b 
— slug-guess in cascade ordering +# --------------------------------------------------------------------------- + + +class TestSlugGuessTier: + """Tests that slug-guess wires correctly into the cascade between HTML-ATS and url_pattern.""" + + def _patch_for_slug_guess(self, monkeypatch, *, slug_guess_result=None, + url_pattern=None): + """Patch cascade so HTML-ATS always misses; control slug_guess and url_pattern.""" + monkeypatch.setattr( + "jobsource.careers.cascade._safe_get_html", + lambda website, client: "", + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_and_fetch", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.recover_via_slug_guess", + lambda *a, **kw: slug_guess_result, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.probe_url_patterns", + lambda *a, **kw: url_pattern, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.scan_homepage_links", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.parse_sitemap", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_ats_in_url", + lambda url: None, + ) + + def test_slug_guess_hit_returns_090_confidence(self, monkeypatch): + """A slug-guess hit should produce confidence=0.90 and method=ats:{name}:slug_guess.""" + slug_result = ATSResult( + ats_name="greenhouse", + careers_url="https://boards.greenhouse.io/anthropic", + position_url="https://boards.greenhouse.io/anthropic/jobs/1", + job_count=370, + ) + self._patch_for_slug_guess(monkeypatch, slug_guess_result=slug_result) + result = find_careers_page( + "https://www.anthropic.com", + company_name="Anthropic", + client=FakeClient(), + ) + assert result.confidence == 0.90 + assert result.method == "ats:greenhouse:slug_guess" + assert result.careers_url == "https://boards.greenhouse.io/anthropic" + assert result.ats_name == "greenhouse" + assert result.position_url == 
"https://boards.greenhouse.io/anthropic/jobs/1" + + def test_slug_guess_hit_blocks_url_pattern(self, monkeypatch): + """If slug-guess hits, url_pattern should not be called.""" + url_pattern_called: list[bool] = [] + slug_result = ATSResult( + ats_name="lever", + careers_url="https://jobs.lever.co/acme", + position_url="https://jobs.lever.co/acme/xyz", + job_count=5, + ) + self._patch_for_slug_guess( + monkeypatch, slug_guess_result=slug_result, url_pattern="https://acme.com/careers" + ) + # Override url_pattern with a tracker + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.probe_url_patterns", + lambda *a, **kw: url_pattern_called.append(True) or None, + ) + find_careers_page("https://www.acme.com", client=FakeClient()) + assert url_pattern_called == [] + + def test_slug_guess_miss_falls_through_to_url_pattern(self, monkeypatch): + """When slug-guess misses, the cascade continues to url_pattern.""" + self._patch_for_slug_guess( + monkeypatch, + slug_guess_result=None, + url_pattern="https://acme.com/careers", + ) + result = find_careers_page("https://www.acme.com", client=FakeClient()) + assert result.method == "url_pattern" + assert result.confidence == 0.80 + + def test_company_name_forwarded_to_slug_guess(self, monkeypatch): + """company_name must be passed through to recover_via_slug_guess.""" + received: list[tuple] = [] + + def fake_recover(website, company_name, client): + received.append((website, company_name)) + return None + + monkeypatch.setattr( + "jobsource.careers.cascade._safe_get_html", + lambda *a, **kw: "", + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_and_fetch", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.recover_via_slug_guess", + fake_recover, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.probe_url_patterns", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.scan_homepage_links", + lambda *a, **kw: 
None, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.parse_sitemap", + lambda *a, **kw: None, + ) + + find_careers_page( + "https://www.anthropic.com", + company_name="Anthropic", + client=FakeClient(), + ) + assert len(received) == 1 + assert received[0] == ("https://www.anthropic.com", "Anthropic") + + def test_slug_guess_tier_error_falls_through(self, monkeypatch): + """A slug-guess tier that raises should not abort the cascade.""" + monkeypatch.setattr( + "jobsource.careers.cascade._safe_get_html", + lambda *a, **kw: "", + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_and_fetch", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.recover_via_slug_guess", + lambda *a, **kw: (_ for _ in ()).throw(RuntimeError("slug_guess exploded")), + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.probe_url_patterns", + lambda *a, **kw: "https://acme.com/careers", + ) + monkeypatch.setattr( + "jobsource.careers.cascade._ats.detect_ats_in_url", + lambda url: None, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.scan_homepage_links", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "jobsource.careers.cascade._heuristics.parse_sitemap", + lambda *a, **kw: None, + ) + result = find_careers_page("https://www.acme.com", client=FakeClient()) + assert result.method == "url_pattern" + assert result.careers_url == "https://acme.com/careers" diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py new file mode 100644 index 0000000..d029e28 --- /dev/null +++ b/tests/test_heuristics.py @@ -0,0 +1,425 @@ +"""Tests for jobsource/careers/heuristics.py — all network-free.""" +from __future__ import annotations + +import pytest + +from jobsource.careers.heuristics import ( + _base_parts, + _is_plausible_careers_url, + _score_anchor, + parse_sitemap, + probe_url_patterns, + scan_homepage_links, +) + + +# 
--------------------------------------------------------------------------- +# Tiny fake response helper +# --------------------------------------------------------------------------- + + +class FakeResponse: + def __init__(self, status_code: int, text: str = "", url: str = "https://example.com"): + self.status_code = status_code + self.text = text + self.url = url + + def json(self) -> object: + return {} + + +class FakeClient: + pass + + +# --------------------------------------------------------------------------- +# _is_plausible_careers_url +# --------------------------------------------------------------------------- + + +class TestIsPlausibleCareersUrl: + def test_same_domain_clean_path(self): + assert _is_plausible_careers_url("https://netflix.com/careers", "https://netflix.com/careers") is True + + def test_same_domain_deeper_path(self): + # Google /careers → /about/careers/applications/ + assert _is_plausible_careers_url( + "https://google.com/careers", + "https://google.com/about/careers/applications/", + ) is True + + def test_soft_404_notfound_rejected(self): + # Netflix SPA: /careers → /NotFound?prev=... 
+ assert _is_plausible_careers_url( + "https://www.netflix.com/careers", + "https://www.netflix.com/NotFound?prev=https%3A%2F%2Fwww.netflix.com%2Fcareers", + ) is False + + def test_soft_404_slash404_rejected(self): + assert _is_plausible_careers_url( + "https://example.com/jobs", + "https://example.com/404", + ) is False + + def test_soft_404_not_found_hyphen_rejected(self): + assert _is_plausible_careers_url( + "https://example.com/jobs", + "https://example.com/not-found", + ) is False + + def test_off_brand_cross_domain_rejected(self): + # Microsoft /careers → bing.com + assert _is_plausible_careers_url( + "https://www.microsoft.com/careers", + "https://www.bing.com?ref=aka&shorturl=abc", + ) is False + + def test_on_brand_cross_domain_accepted(self): + # Amazon /careers → amazon.jobs + assert _is_plausible_careers_url( + "https://www.amazon.com/careers", + "https://amazon.jobs/en/", + ) is True + + def test_workday_subdomain_accepted(self): + # nvidia.com/careers → nvidia.wd5.myworkdayjobs.com/... 
+ assert _is_plausible_careers_url( + "https://www.nvidia.com/careers", + "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite", + ) is True + + def test_short_brand_not_filtered(self): + # brand <= 3 chars: skip the off-brand check + assert _is_plausible_careers_url( + "https://ibm.com/careers", + "https://jobs.example.com/", + ) is True # "ibm" len=3 → no cross-domain filter applied + + +# --------------------------------------------------------------------------- +# _base_parts +# --------------------------------------------------------------------------- + + +class TestBaseParts: + def test_strips_www(self): + scheme, host, root = _base_parts("https://www.acme.com/about") + assert scheme == "https" + assert host == "www.acme.com" + assert root == "acme.com" + + def test_no_www(self): + _, _, root = _base_parts("https://acme.com") + assert root == "acme.com" + + def test_subdomain_preserved_in_host(self): + _, host, root = _base_parts("https://careers.acme.com/jobs") + assert host == "careers.acme.com" + assert root == "careers.acme.com" # www. 
stripping only + + def test_http_scheme(self): + scheme, _, _ = _base_parts("http://acme.com") + assert scheme == "http" + + +# --------------------------------------------------------------------------- +# _score_anchor +# --------------------------------------------------------------------------- + + +class TestScoreAnchor: + def test_careers_href_high_score(self): + assert _score_anchor("/careers", "") > 2.0 + + def test_jobs_href_score(self): + assert _score_anchor("/jobs", "") > 0 + + def test_text_careers_adds_score(self): + score_with = _score_anchor("/x", "Careers") + score_without = _score_anchor("/x", "About") + assert score_with > score_without + + def test_unrelated_href_and_text_zero(self): + assert _score_anchor("/about", "About us") == 0.0 + + def test_combined_href_and_text(self): + combined = _score_anchor("/careers", "Careers") + href_only = _score_anchor("/careers", "") + assert combined > href_only + + +# --------------------------------------------------------------------------- +# probe_url_patterns +# --------------------------------------------------------------------------- + + +class TestProbeUrlPatterns: + def test_returns_careers_path_on_hit(self, monkeypatch): + def fake_probe(client, url): + if url.endswith("/careers"): + return "https://acme.com/careers" + return None + + monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe) + result = probe_url_patterns("https://acme.com", FakeClient()) + assert result == "https://acme.com/careers" + + def test_returns_none_when_all_miss(self, monkeypatch): + monkeypatch.setattr("jobsource.careers.heuristics.probe_url", lambda c, u: None) + result = probe_url_patterns("https://acme.com", FakeClient()) + assert result is None + + def test_returns_first_hit_not_second(self, monkeypatch): + hits = [] + + def fake_probe(client, url): + hits.append(url) + if url.endswith("/career"): + return "https://acme.com/career" + return None + + 
monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe) + result = probe_url_patterns("https://acme.com", FakeClient()) + assert result == "https://acme.com/career" + # /careers was probed first and missed; /career was the first hit + assert any("/careers" in u for u in hits) + + def test_careers_subdomain_candidate_included(self, monkeypatch): + probed: list[str] = [] + + def fake_probe(client, url): + probed.append(url) + return None + + monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe) + probe_url_patterns("https://www.acme.com", FakeClient()) + assert any("careers.acme.com" in u for u in probed) + assert any("jobs.acme.com" in u for u in probed) + + def test_jobs_subdomain_hit(self, monkeypatch): + def fake_probe(client, url): + if "jobs.acme.com" in url: + return "https://jobs.acme.com" + return None + + monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe) + result = probe_url_patterns("https://www.acme.com", FakeClient()) + assert result == "https://jobs.acme.com" + + def test_soft_404_rejected_falls_through_to_none(self, monkeypatch): + # All candidates redirect to a NotFound page — should return None + monkeypatch.setattr( + "jobsource.careers.heuristics.probe_url", + lambda c, u: u.replace("/careers", "/NotFound").replace("/career", "/NotFound") + .replace("/jobs", "/NotFound").replace("/join-us", "/NotFound") + .replace("/join", "/NotFound"), + ) + result = probe_url_patterns("https://www.netflix.com", FakeClient()) + assert result is None + + def test_off_brand_redirect_rejected(self, monkeypatch): + # /careers redirects to an off-brand domain → skip; later candidate hits + def fake_probe(client, url): + if url.endswith("/careers"): + return "https://www.bing.com?ref=aka" + if url.endswith("/jobs"): + return "https://acme.com/jobs" + return None + + monkeypatch.setattr("jobsource.careers.heuristics.probe_url", fake_probe) + result = probe_url_patterns("https://www.acme.com", FakeClient()) + 
assert result == "https://acme.com/jobs" + + +# --------------------------------------------------------------------------- +# scan_homepage_links +# --------------------------------------------------------------------------- + + +HOMEPAGE_WITH_CAREERS = """ + + + +""" + +HOMEPAGE_NO_CAREER_LINKS = """ + + + +""" + +HOMEPAGE_RELATIVE_LINK = """ + +Open Jobs + +""" + +HOMEPAGE_ABSOLUTE_LEVER = """ + +Work with us + +""" + + +class TestScanHomepageLinks: + def test_finds_careers_link(self): + result = scan_homepage_links( + "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_WITH_CAREERS + ) + assert result is not None + assert "careers" in result + + def test_no_career_links_returns_none(self): + result = scan_homepage_links( + "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_NO_CAREER_LINKS + ) + assert result is None + + def test_relative_href_resolved(self): + result = scan_homepage_links( + "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_RELATIVE_LINK + ) + # "jobs" href + "Open Jobs" text should score above threshold + assert result is not None + assert result.startswith("https://acme.com") + + def test_absolute_external_link_preserved(self): + result = scan_homepage_links( + "https://acme.com", FakeClient(), homepage_html=HOMEPAGE_ABSOLUTE_LEVER + ) + assert result == "https://jobs.lever.co/acme" + + def test_empty_html_returns_none(self): + result = scan_homepage_links( + "https://acme.com", FakeClient(), homepage_html="" + ) + assert result is None + + def test_fetch_failure_returns_none(self, monkeypatch): + def boom(*a, **kw): + raise RuntimeError("connection refused") + monkeypatch.setattr("jobsource.careers.heuristics.request_with_retries", boom) + result = scan_homepage_links("https://acme.com", FakeClient()) + assert result is None + + def test_http_error_status_returns_none(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.heuristics.request_with_retries", + lambda *a, **kw: FakeResponse(500, "error"), + ) + result 
= scan_homepage_links("https://acme.com", FakeClient()) + assert result is None + + def test_skips_mailto_links(self): + html = 'Email us' + result = scan_homepage_links("https://acme.com", FakeClient(), homepage_html=html) + assert result is None + + def test_skips_fragment_only_links(self): + html = 'Careers' + result = scan_homepage_links("https://acme.com", FakeClient(), homepage_html=html) + assert result is None + + def test_prefers_href_careers_over_unrelated_text(self): + # /careers in href scores; /about with neutral text should score lower. + html = """ + + Company information + Some random text + + """ + result = scan_homepage_links("https://acme.com", FakeClient(), homepage_html=html) + assert result is not None + assert "careers" in result + + +# --------------------------------------------------------------------------- +# parse_sitemap +# --------------------------------------------------------------------------- + + +SIMPLE_SITEMAP = """ + + https://acme.com/about + https://acme.com/careers + https://acme.com/blog +""" + +SITEMAP_NO_CAREERS = """ + + https://acme.com/about + https://acme.com/blog +""" + +SITEMAP_INDEX = """ + + https://acme.com/sitemap-pages.xml + https://acme.com/sitemap-jobs.xml +""" + +SITEMAP_JOBS_CHILD = """ + + https://acme.com/jobs/senior-engineer +""" + +SITEMAP_PAGES_CHILD = """ + + https://acme.com/about +""" + + +class TestParseSitemap: + def test_finds_careers_url(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.heuristics.request_with_retries", + lambda *a, **kw: FakeResponse(200, SIMPLE_SITEMAP, "https://acme.com/sitemap.xml"), + ) + result = parse_sitemap("https://acme.com", FakeClient()) + assert result == "https://acme.com/careers" + + def test_no_careers_url_returns_none(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.heuristics.request_with_retries", + lambda *a, **kw: FakeResponse(200, SITEMAP_NO_CAREERS), + ) + result = parse_sitemap("https://acme.com", FakeClient()) + assert 
result is None + + def test_404_returns_none(self, monkeypatch): + monkeypatch.setattr( + "jobsource.careers.heuristics.request_with_retries", + lambda *a, **kw: FakeResponse(404, ""), + ) + result = parse_sitemap("https://acme.com", FakeClient()) + assert result is None + + def test_network_error_returns_none(self, monkeypatch): + def boom(*a, **kw): + raise RuntimeError("timeout") + monkeypatch.setattr("jobsource.careers.heuristics.request_with_retries", boom) + result = parse_sitemap("https://acme.com", FakeClient()) + assert result is None + + def test_sitemap_index_fetches_children(self, monkeypatch): + responses = { + "https://acme.com/sitemap.xml": FakeResponse(200, SITEMAP_INDEX), + "https://acme.com/sitemap-pages.xml": FakeResponse(200, SITEMAP_PAGES_CHILD), + "https://acme.com/sitemap-jobs.xml": FakeResponse(200, SITEMAP_JOBS_CHILD), + } + + def fake_req(client, method, url, **kw): + # Strip query params for lookup + base_url = url.split("?")[0] + return responses.get(base_url, FakeResponse(404, "")) + + monkeypatch.setattr("jobsource.careers.heuristics.request_with_retries", fake_req) + result = parse_sitemap("https://acme.com", FakeClient()) + assert result == "https://acme.com/jobs/senior-engineer" diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 2ce4422..b562fc4 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -102,7 +102,9 @@ def test_job_result_is_complete() -> None: def test_settings_load_defaults() -> None: from jobsource.config import Settings - s = Settings() + # _env_file=None suppresses .env loading for this instance so we see the + # coded defaults, not whatever the operator has set in the real .env file. + s = Settings(_env_file=None) assert s.job_source == "jobspy" assert s.batch_size == 20 assert s.hours_old == 72