scaffold

2026-06-17 08:38:15 -04:00
commit f13b8fc1ca
28 changed files with 894 additions and 0 deletions
--- a/jobsource/__init__.py
+++ b/jobsource/__init__.py
@@ -0,0 +1,3 @@
+"""AI Job Source Agent package."""
+
+__version__ = "0.1.0"
--- a/jobsource/agent_fallback.py
+++ b/jobsource/agent_fallback.py
@@ -0,0 +1,11 @@
+"""Browser Use fused fallback: find careers page AND extract one job URL in one session.
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (Stage 2/3 last resort): implement per CLAUDE.md "Stage 2 — tier 6" and "Stage 3 — tier 5".
+# This is the LAST tier of the cascade. Fires only when all cheaper tiers in cascade.py
+# and extract.py have failed. One Browser Use agent session does both:
+#   1. Navigate to the company website and locate the careers/jobs page.
+#   2. From the careers page, return the URL of one open position.
+# Graceful degradation: if Browser Use / Playwright / LLM key are unavailable, log clearly
+# and return (careers_url=None, position_url=None) so the pipeline records needs_review.
--- a/jobsource/careers/__init__.py
+++ b/jobsource/careers/__init__.py
@@ -0,0 +1 @@
+"""Careers page discovery sub-package (Stage 2 cascade)."""
--- a/jobsource/careers/ats.py
+++ b/jobsource/careers/ats.py
@@ -0,0 +1,17 @@
+"""ATS detection and public JSON API fetching (Stage 2, tier 1).
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (Stage 2, tier 1): implement per CLAUDE.md "Stage 2 — ATS detection".
+# Detect Greenhouse / Lever / Ashby / Workday from the company website, then call
+# their public JSON APIs (no login needed). On success, return both the careers page URL
+# AND the first job posting URL (so Stage 3 can skip its own cascade for ATS companies).
+#
+# Confirmed ATS JSON field shapes (verify live before trusting — see CLAUDE.md Gotchas):
+#   Greenhouse: GET https://boards-api.greenhouse.io/v1/boards/{slug}/jobs
+#               → {"jobs": [{"absolute_url": "...", ...}, ...]}
+#   Lever:      GET https://api.lever.co/v0/postings/{company}?mode=json
+#               → [{"hostedUrl": "...", ...}, ...]
+#   Ashby:      GET https://api.ashbyhq.com/posting-api/job-board/{slug}
+#               → {"jobs": [{"jobUrl": "...", ...}, ...]}
+#   Workday:    varies by tenant — needs per-tenant discovery logic
--- a/jobsource/careers/cascade.py
+++ b/jobsource/careers/cascade.py
@@ -0,0 +1,13 @@
+"""find_careers_page(): orchestrate the Stage 2 tier cascade.
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (Stage 2): implement per CLAUDE.md "Stage 2 — Find careers page (cascade, return on first hit)".
+# Cascade order (return early on first success):
+#   1. ATS detection  → ats.detect_and_fetch()
+#   2. URL patterns   → heuristics.probe_url_patterns()
+#   3. Homepage scan  → heuristics.scan_homepage_links()
+#   4. Sitemap        → heuristics.parse_sitemap()
+#   5. Cheap-LLM      → classify_llm.classify_careers_link()
+#   6. Browser agent  → agent_fallback.run_fused_agent()  (also handles Stage 3)
+# Returns (careers_url: str | None, method: str, ats_name: str | None).
--- a/jobsource/careers/classify_llm.py
+++ b/jobsource/careers/classify_llm.py
@@ -0,0 +1,13 @@
+"""Cheap-LLM link classification for careers page and job links (Stage 2, tier 5 / Stage 3, tier 4).
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (Stage 2 tier 5 / Stage 3 tier 4): implement per CLAUDE.md "Cheap-LLM classification".
+# Uses Pydantic AI (model-agnostic) with the `classifier_model` from config.
+# Two typed tasks:
+#   1. classify_careers_link(anchors: list[Anchor]) -> CareerLinkResult
+#      Given extracted <a> tags from a page, pick the careers/jobs page URL.
+#   2. classify_job_link(anchors: list[Anchor]) -> JobLinkResult
+#      Given extracted <a> tags from a careers page, pick one open-position URL.
+# Both return a typed Pydantic result including the chosen URL and confidence.
+# Graceful degradation: if llm_api_key is placeholder or call fails, return None.
--- a/jobsource/careers/heuristics.py
+++ b/jobsource/careers/heuristics.py
@@ -0,0 +1,11 @@
+"""Deterministic careers-page heuristics: URL probing, homepage scan, sitemap (Stage 2, tiers 2–4).
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (Stage 2, tiers 2–4): implement per CLAUDE.md "Stage 2 — URL patterns / homepage / sitemap".
+# Tier 2 — URL patterns: probe /careers, /career, /jobs, /join-us, /join,
+#           careers.{domain}, jobs.{domain} via HTTP HEAD (or GET if HEAD fails).
+# Tier 3 — Homepage link scan: fetch homepage HTML, parse with BeautifulSoup + lxml,
+#           rank <a> anchors by career/job keywords in href/text, return highest-ranked.
+# Tier 4 — Sitemap: fetch sitemap.xml (and sitemap index if present), scan for career/job URLs.
+# Each function returns (url: str | None) so cascade.py can return early on first hit.
--- a/jobsource/config.py
+++ b/jobsource/config.py
@@ -0,0 +1,64 @@
+"""Application configuration, loaded from the environment via pydantic-settings.
+
+Every setting is env-driven. Model identifiers and API keys are read from the
+environment with inert placeholder defaults — the operator supplies real values
+in `.env`. Never hardcode real model IDs or secrets in this file.
+"""
+from __future__ import annotations
+
+from functools import lru_cache
+from pathlib import Path
+
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+    """Typed application settings populated from the environment and `.env`.
+
+    Every field below has a default, so the class can be instantiated with no
+    environment at all; string defaults starting with ``PLACEHOLDER_`` are
+    inert stand-ins, not usable credentials or model identifiers.
+    """
+
+    # Read `.env` as UTF-8, match variable names case-insensitively, and
+    # ignore any variables that do not correspond to a field declared here.
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        extra="ignore",
+        case_sensitive=False,
+    )
+
+    # -- Job source / ingestion --------------------------------------------
+    job_source: str = Field(default="jobspy", description="Ingestion provider: 'jobspy' | 'apify'.")
+    # default_factory gives each Settings instance its own list object.
+    search_terms: list[str] = Field(default_factory=lambda: ["software engineer"])
+    location: str = "United States"
+    hours_old: int = 72
+    batch_size: int = 20
+    results_wanted: int = 50
+
+    # -- Apify (only used when job_source == 'apify') ----------------------
+    apify_token: str = "PLACEHOLDER_APIFY_TOKEN"
+    apify_actor: str = "PLACEHOLDER_APIFY_ACTOR"
+
+    # -- Website resolution (optional search API) --------------------------
+    search_api_enabled: bool = False
+    search_api_key: str = "PLACEHOLDER_SEARCH_API_KEY"
+
+    # -- LLM / agent models (placeholders -- set real IDs in .env) ---------
+    # NEVER hardcode real model identifiers. These are inert placeholders.
+    llm_api_key: str = "PLACEHOLDER_LLM_API_KEY"
+    classifier_model: str = "PLACEHOLDER_CLASSIFIER_MODEL"   # cheap model: link classification
+    agent_model: str = "PLACEHOLDER_AGENT_MODEL"             # stronger model: browser agent
+
+    # -- HTTP client -------------------------------------------------------
+    # NOTE(review): timeout is presumably in seconds (passed to httpx.Timeout) -- confirm.
+    http_timeout: float = 20.0
+    http_max_retries: int = 3
+    http_backoff_factor: float = 0.5
+    user_agent: str = "JobSourceAgent/0.1 (+https://example.com)"
+
+    # -- Storage / output --------------------------------------------------
+    # Relative paths; NOTE(review): presumably resolved against the working directory -- confirm.
+    db_path: Path = Path("output/jobsource.db")
+    output_csv: Path = Path("output/results.csv")
+
+    # -- Browser agent (fallback tier) -------------------------------------
+    enable_browser_agent: bool = True
+    browser_headless: bool = True
+
+
+@lru_cache
+def get_settings() -> Settings:
+    """Return the cached Settings singleton (call get_settings.cache_clear() in tests)."""
+    return Settings()
--- a/jobsource/db.py
+++ b/jobsource/db.py
@@ -0,0 +1,10 @@
+"""SQLite persistence layer: companies table, jobs table, dedup, company cache, CSV export.
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (Stage 4): implement per CLAUDE.md "Stage 4 — Persist & export" and "Data model".
+# Schema:
+#   companies(company_key PK, name, website, career_url, first_seen)
+#   jobs(job_id PK, company_key, linkedin_url, position_url, status, listed_at, first_seen)
+# CSV export writes output/results.csv with columns: company_name, career_page_url, open_position_url
+# (complete rows — status==position_found — sorted first; incomplete rows follow).
--- a/jobsource/extract.py
+++ b/jobsource/extract.py
@@ -0,0 +1,12 @@
+"""Extract one open position URL from a careers page (Stage 3).
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (Stage 3): implement per CLAUDE.md "Stage 3 — Extract one open position (return on first hit)".
+# Cascade order (return early on first hit):
+#   1. ATS JSON — if ATS is already known from Stage 2, return first posting URL directly.
+#   2. JobPosting JSON-LD — parse application/ld+json for a `url` field.
+#   3. Job-like anchors — first <a> matching /job, /position, /opening, /vacancy in href.
+#   4. Cheap-LLM classification — Pydantic AI typed output (classifier_model).
+#   5. Browser-agent fallback — handled inside the fused Stage-2 agent call in agent_fallback.py.
+# Returns (url: str | None, method: str) so callers know which tier resolved it.
--- a/jobsource/flow.py
+++ b/jobsource/flow.py
@@ -0,0 +1,10 @@
+"""Prefect flow definition and interval schedule.
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (scheduling): implement per CLAUDE.md "Orchestration/scheduling: Prefect".
+# Wrap run_batch() in a @flow with:
+#   - Retries on the flow level.
+#   - An interval schedule (configurable; default daily).
+# Run with: python -m jobsource.flow
+# Cron fallback (no daemon): 0 6 * * * cd <repo> && ./.venv/bin/python -m jobsource.main --batch-size 50
--- a/jobsource/http.py
+++ b/jobsource/http.py
@@ -0,0 +1,97 @@
+"""Shared httpx client factory and a small bounded-retry helper.
+
+Every outbound HTTP call in the pipeline should go through a client built here
+so timeouts, headers, and bounded retries are applied consistently. Connection-
+level retries are handled by the transport; request_with_retries adds bounded
+retries for transient HTTP status codes.
+"""
+from __future__ import annotations
+
+import logging
+import time
+from collections.abc import Iterable
+
+import httpx
+
+from .config import get_settings
+
+logger = logging.getLogger(__name__)
+
+_RETRY_STATUS = frozenset({429, 500, 502, 503, 504})
+
+
+def default_headers() -> dict[str, str]:
+    settings = get_settings()
+    return {
+        "User-Agent": settings.user_agent,
+        "Accept": "text/html,application/xhtml+xml,application/json;q=0.9,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.9",
+    }
+
+
+def build_client(**overrides: object) -> httpx.Client:
+    """Create a configured sync httpx client.
+
+    Timeout and connection-level retries come from settings; callers may pass
+    httpx.Client kwargs as overrides (e.g. base_url, extra headers).
+    """
+    settings = get_settings()
+    kwargs: dict[str, object] = {
+        "timeout": httpx.Timeout(settings.http_timeout),
+        "headers": default_headers(),
+        "follow_redirects": True,
+        "transport": httpx.HTTPTransport(retries=settings.http_max_retries),
+    }
+    kwargs.update(overrides)
+    return httpx.Client(**kwargs)  # type: ignore[arg-type]
+
+
+def request_with_retries(
+    client: httpx.Client,
+    method: str,
+    url: str,
+    *,
+    max_retries: int | None = None,
+    retry_status: Iterable[int] = _RETRY_STATUS,
+    **kwargs: object,
+) -> httpx.Response:
+    """Issue a request, retrying on transient status codes with exponential backoff."""
+    settings = get_settings()
+    retries = settings.http_max_retries if max_retries is None else max_retries
+    backoff = settings.http_backoff_factor
+    statuses = frozenset(retry_status)
+    last_exc: Exception | None = None
+    for attempt in range(retries + 1):
+        try:
+            response = client.request(method, url, **kwargs)  # type: ignore[arg-type]
+            if response.status_code in statuses and attempt < retries:
+                sleep_for = backoff * (2**attempt)
+                logger.warning(
+                    "HTTP %s on %s (attempt %d/%d); retrying in %.1fs",
+                    response.status_code,
+                    url,
+                    attempt + 1,
+                    retries,
+                    sleep_for,
+                )
+                time.sleep(sleep_for)
+                continue
+            return response
+        except httpx.HTTPError as exc:
+            last_exc = exc
+            if attempt < retries:
+                sleep_for = backoff * (2**attempt)
+                logger.warning(
+                    "HTTP error on %s (attempt %d/%d): %s; retrying in %.1fs",
+                    url,
+                    attempt + 1,
+                    retries,
+                    exc,
+                    sleep_for,
+                )
+                time.sleep(sleep_for)
+                continue
+            raise
+    if last_exc is not None:  # pragma: no cover - defensive
+        raise last_exc
+    raise RuntimeError("request_with_retries exhausted without a response")
--- a/jobsource/main.py
+++ b/jobsource/main.py
@@ -0,0 +1,55 @@
+"""CLI entry point: `python -m jobsource.main`.
+
+Scaffold stub. Argument parsing is wired so `--help` works; the actual batch
+run lands in a later step (see jobsource/pipeline.py). Imports only stdlib so
+`--help` works before the heavier dependencies are installed.
+"""
+from __future__ import annotations
+
+import argparse
+import sys
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="python -m jobsource.main",
+        description=(
+            "AI Job Source Agent -- emit company_name, career_page_url, "
+            "open_position_url for recently posted LinkedIn jobs."
+        ),
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=None,
+        help="Number of new jobs to process this run (default from config).",
+    )
+    parser.add_argument(
+        "--search",
+        action="append",
+        metavar="TERM",
+        help="Search term; repeatable. Overrides config search terms.",
+    )
+    parser.add_argument(
+        "--location",
+        default=None,
+        help="Job location filter (default from config).",
+    )
+    parser.add_argument(
+        "--hours-old",
+        type=int,
+        default=None,
+        help="Only jobs posted within this many hours (default from config).",
+    )
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    args = build_parser().parse_args(argv)
+    print("jobsource: scaffold stub -- pipeline not implemented yet.", file=sys.stderr)
+    print(f"parsed args: {vars(args)}", file=sys.stderr)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/jobsource/models.py
+++ b/jobsource/models.py
@@ -0,0 +1,88 @@
+"""Pydantic data models shared across the pipeline.
+
+RawJob is the normalized output of any job source (Stage 1). JobResult is the
+per-job record that flows through the cascade and becomes one CSV row. The CSV
+contract is exactly three columns: company_name, career_page_url,
+open_position_url.
+"""
+from __future__ import annotations
+
+from datetime import datetime
+from enum import Enum
+
+from pydantic import BaseModel, Field
+
+
+class JobStatus(str, Enum):
+    """Lifecycle of a single job record. Complete == position_found.
+
+    Members are also ``str`` instances, so each compares equal to its plain
+    string value.
+    """
+
+    # Initial state; JobResult.status defaults to this.
+    new = "new"
+    # NOTE(review): the two intermediate states below presumably track the
+    # website-resolution and careers-page stages -- confirm against the pipeline.
+    website_resolved = "website_resolved"
+    careers_found = "careers_found"
+    # The only state JobResult.is_complete treats as complete.
+    position_found = "position_found"
+    failed = "failed"
+    needs_review = "needs_review"
+
+
+class RawJob(BaseModel):
+    """Normalized job posting from a source provider (Stage 1 output).
+
+    ``job_id``, ``company`` and ``linkedin_url`` are required; every other
+    field is optional and defaults to ``None``.
+    """
+
+    # Required fields (``...`` marks a field with no default).
+    job_id: str = Field(..., description="LinkedIn numeric jobPostingId, parsed from the job URL.")
+    company: str = Field(..., description="Company name as reported by the source.")
+    linkedin_url: str = Field(..., description="Canonical LinkedIn job-view URL.")
+    # Optional fields, filled in only when the provider supplies them.
+    website: str | None = Field(default=None, description="Company's own site, if provided.")
+    listed_at: datetime | None = Field(default=None, description="When the job was posted, if known.")
+    title: str | None = Field(default=None, description="Job title, if provided.")
+    location: str | None = Field(default=None, description="Job location, if provided.")
+
+
+class JobResult(BaseModel):
+    """Per-job record carried through the cascade; serializes to one CSV row."""
+
+    job_id: str
+    company_name: str
+    company_key: str | None = Field(
+        default=None, description="Normalized domain, else lowercased name."
+    )
+    website: str | None = None
+    career_page_url: str | None = None
+    open_position_url: str | None = None
+    status: JobStatus = JobStatus.new
+    linkedin_url: str | None = None
+    listed_at: datetime | None = None
+    title: str | None = None
+    location: str | None = None
+    # Observability: which cascade tier/method resolved each stage.
+    careers_method: str | None = None
+    position_method: str | None = None
+
+    @property
+    def is_complete(self) -> bool:
+        """A record is complete once an open position has been found."""
+        return self.status == JobStatus.position_found
+
+    @classmethod
+    def from_raw(cls, raw: RawJob) -> "JobResult":
+        """Seed a result from a raw job (status starts at `new`)."""
+        return cls(
+            job_id=raw.job_id,
+            company_name=raw.company,
+            website=raw.website,
+            linkedin_url=raw.linkedin_url,
+            listed_at=raw.listed_at,
+            title=raw.title,
+            location=raw.location,
+            status=JobStatus.new,
+        )
+
+    def to_csv_row(self) -> dict[str, str]:
+        """Return exactly the three contract columns (empty string for None)."""
+        return {
+            "company_name": self.company_name or "",
+            "career_page_url": self.career_page_url or "",
+            "open_position_url": self.open_position_url or "",
+        }
+
+
+# The CSV output contract — exactly these columns, in this order.
+CSV_COLUMNS: tuple[str, str, str] = ("company_name", "career_page_url", "open_position_url")
--- a/jobsource/pipeline.py
+++ b/jobsource/pipeline.py
@@ -0,0 +1,12 @@
+"""Batch orchestration: dedup, per-record isolation, cascade, persistence, summary.
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (pipeline): implement run_batch() per CLAUDE.md "Pipeline stages".
+# run_batch() contract:
+#   - Accept batch_size, search terms, location, hours_old overrides.
+#   - Call the job source, dedup by job_id against the DB (skip already-seen jobs).
+#   - For each new RawJob, run the full cascade (resolve -> careers -> extract) in isolation:
+#     one failing record must NEVER abort the batch — catch, record failed/needs_review, continue.
+#   - Persist each JobResult to the DB and export output/results.csv when done.
+#   - Print a run summary: per-stage counts + % of new jobs reaching position_found.
--- a/jobsource/resolve.py
+++ b/jobsource/resolve.py
@@ -0,0 +1,10 @@
+"""Resolve company name → company website URL (Stage 1b, deterministic).
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (Stage 1b): implement per CLAUDE.md "Stage 1b — Resolve website (deterministic)".
+# Resolution order:
+#   1. Use provider-supplied website if present.
+#   2. Verified domain guess: normalize company name to {slug}.com and probe via HTTP HEAD.
+#   3. Optional search API (SEARCH_API_ENABLED=true) as final fallback.
+# Returns the resolved URL string, or None if unresolvable.
--- a/jobsource/sources/__init__.py
+++ b/jobsource/sources/__init__.py
@@ -0,0 +1 @@
+"""Job source provider package."""
--- a/jobsource/sources/apify_source.py
+++ b/jobsource/sources/apify_source.py
@@ -0,0 +1,8 @@
+"""Apify ingestion provider (alternative, paid) — implements JobSource.
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (Stage 1): implement ApifySource per CLAUDE.md "Stage 1 — Ingest".
+# Drop-in alternative to JobSpySource; same JobSource interface.
+# Uses apify-client; actor ID from config (APIFY_ACTOR env var).
+# Map Apify actor output fields → RawJob; same dedup key (LinkedIn jobPostingId).
--- a/jobsource/sources/base.py
+++ b/jobsource/sources/base.py
@@ -0,0 +1,16 @@
+"""JobSource interface: every ingestion provider must implement fetch_recent_jobs().
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (Stage 1): define the JobSource ABC per CLAUDE.md "Stage 1 — Ingest (deterministic)".
+# Interface:
+#   class JobSource(ABC):
+#       @abstractmethod
+#       def fetch_recent_jobs(
+#           self,
+#           search_terms: list[str],
+#           location: str,
+#           hours_old: int,
+#           results_wanted: int,
+#       ) -> list[RawJob]: ...
+# Implementations: jobspy_source.JobSpySource, apify_source.ApifySource.
--- a/jobsource/sources/jobspy_source.py
+++ b/jobsource/sources/jobspy_source.py
@@ -0,0 +1,10 @@
+"""JobSpy ingestion provider (default, free) — implements JobSource.
+
+Scaffold stub -- not implemented yet.
+"""
+# TODO (Stage 1): implement JobSpySource per CLAUDE.md "Stage 1 — Ingest".
+# Uses python-jobspy (PyPI package name; imported as `jobspy`). Key notes:
+#   - Search LinkedIn via JobSpy; parse LinkedIn numeric jobPostingId from the job URL.
+#   - Map JobSpy result fields → RawJob (company, website from company_url_direct if present).
+#   - Strip tracking query params from linkedin_url; keep only /jobs/view/{id}.
+#   - Log observed fill rate of company_url_direct (see CLAUDE.md Gotchas).
				`@@ -0,0 +1 @@`
				`"""Careers page discovery sub-package (Stage 2 cascade)."""`