phase1-ingest-resolve

2026-06-17 13:59:00 -04:00
parent f13b8fc1ca
commit cd9ab9b95e
8 changed files with 958 additions and 35 deletions
--- a/jobsource/resolve.py
+++ b/jobsource/resolve.py
@@ -1,10 +1,122 @@
 """Resolve company name → company website URL (Stage 1b, deterministic).

-Scaffold stub -- not implemented yet.
+Three-tier cascade — returns on first hit:
+  Tier 1: provider-supplied website (trusted, no network call).
+  Tier 2: verified {slug}.com guess (HTTP HEAD/GET probe).
+  Tier 3: search API (gated by SEARCH_API_ENABLED; ships as a stub — wire
+          a real provider in _search_api_lookup() when ready).
 """
-# TODO (Stage 1b): implement per CLAUDE.md "Stage 1b — Resolve website (deterministic)".
-# Resolution order:
-#   1. Use provider-supplied website if present.
-#   2. Verified domain guess: normalize company name to {slug}.com and probe via HTTP HEAD.
-#   3. Optional search API (SEARCH_API_ENABLED=true) as final fallback.
-# Returns the resolved URL string, or None if unresolvable.
+from __future__ import annotations
+
+import logging
+import re
+
+import httpx
+
+from .config import get_settings
+from .http import build_client, request_with_retries
+
+logger = logging.getLogger(__name__)
+
+# Legal suffix words stripped when building the domain slug.
+_LEGAL_SUFFIXES = re.compile(
+    r"\b(inc|llc|ltd|corp|co|gmbh|plc|sa|ag|pbc|lp|llp)\b",
+    re.IGNORECASE,
+)
+_NON_ALNUM = re.compile(r"[^a-z0-9]+")
+
+# HEAD responses that indicate the server doesn't support HEAD — retry with GET.
+_HEAD_NOT_SUPPORTED = frozenset({405, 501})
+
+
+def resolve_website(
+    company_name: str,
+    website: str | None = None,
+    *,
+    client: httpx.Client | None = None,
+) -> str | None:
+    """Return the company's own website URL, or None if unresolvable.
+
+    Pass an existing httpx.Client to reuse connections across many calls;
+    otherwise a short-lived client is created and closed here.
+    """
+    settings = get_settings()
+    _managed = client is None
+    if _managed:
+        client = build_client()
+
+    try:
+        # Tier 1 — provider-supplied website (trusted, no network needed).
+        if website and not website.startswith("PLACEHOLDER"):
+            resolved = _normalize_scheme(website)
+            logger.info("resolve(%s): tier=provider url=%s", company_name, resolved)
+            return resolved
+
+        # Tier 2 — {slug}.com guess with HTTP verification.
+        slug = _slug(company_name)
+        if slug:
+            guessed = f"https://{slug}.com"
+            verified = _verify(client, guessed)
+            if verified:
+                logger.info("resolve(%s): tier=slug_guess url=%s", company_name, verified)
+                return verified
+
+        # Tier 3 — optional search API (gated; stub by default).
+        if settings.search_api_enabled and not settings.search_api_key.startswith("PLACEHOLDER"):
+            result = _search_api_lookup(company_name, client)
+            if result:
+                logger.info("resolve(%s): tier=search_api url=%s", company_name, result)
+                return result
+
+        logger.info("resolve(%s): unresolvable (all tiers missed)", company_name)
+        return None
+    finally:
+        if _managed:
+            client.close()
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _normalize_scheme(url: str) -> str:
+    """Add https:// if the URL has no scheme."""
+    if url.startswith(("http://", "https://")):
+        return url
+    return f"https://{url}"
+
+
+def _slug(name: str) -> str | None:
+    """Normalize company name to a domain slug (lowercase, no legal suffixes, alnum only)."""
+    s = name.lower()
+    s = _LEGAL_SUFFIXES.sub("", s)
+    s = _NON_ALNUM.sub("", s)
+    return s or None
+
+
+def _verify(client: httpx.Client, url: str) -> str | None:
+    """Probe url with HEAD (fallback to GET on 405/501); return final URL or None."""
+    try:
+        resp = request_with_retries(client, "HEAD", url, max_retries=1)
+        if resp.status_code in _HEAD_NOT_SUPPORTED:
+            resp = request_with_retries(client, "GET", url, max_retries=1)
+        if resp.status_code < 400:
+            return str(resp.url)
+        return None
+    except Exception:
+        return None
+
+
+def _search_api_lookup(company_name: str, client: httpx.Client) -> str | None:
+    """Search API fallback — returns the company's website from a web search.
+
+    Integration point: implement by querying a search API with
+    f'"{company_name}" official website' using settings.search_api_key,
+    extracting the registrable domain from the top organic result's URL,
+    verifying it with _verify(), and returning the URL or None.
+
+    Currently a stub; enabled only when SEARCH_API_ENABLED=true and a real
+    SEARCH_API_KEY is set.
+    """
+    return None
--- a/jobsource/sources/init.py
+++ b/jobsource/sources/init.py
@@ -1 +1,38 @@
-"""Job source provider package."""
+"""Job source provider package.
+
+Public API: JobSource interface + get_job_source() factory.
+Heavy provider imports are deferred to get_job_source() so loading this
+package does not pull in pandas, jobspy, or apify-client at import time.
+"""
+from __future__ import annotations
+
+from .base import JobSource
+
+__all__ = ["JobSource", "get_job_source"]
+
+
+def get_job_source(settings=None) -> JobSource:
+    """Return the configured JobSource instance.
+
+    Reads job_source from settings (default: get_settings()). Lazy-imports
+    the chosen module so the unused provider's heavy deps are never loaded.
+
+    Raises ValueError for unknown provider names.
+    """
+    if settings is None:
+        from ..config import get_settings
+
+        settings = get_settings()
+
+    provider = settings.job_source.lower()
+    if provider == "jobspy":
+        from .jobspy_source import JobSpySource
+
+        return JobSpySource()
+    if provider == "apify":
+        from .apify_source import ApifySource
+
+        return ApifySource()
+    raise ValueError(
+        f"Unknown job_source {settings.job_source!r}. Valid values: 'jobspy', 'apify'."
+    )
--- a/jobsource/sources/apify_source.py
+++ b/jobsource/sources/apify_source.py
@@ -1,8 +1,129 @@
 """Apify ingestion provider (alternative, paid) — implements JobSource.

-Scaffold stub -- not implemented yet.
+Drop-in replacement for JobSpySource; same interface, selected by config
+(JOB_SOURCE=apify). Two adjustment points when swapping actors:
+  1. _run_actor() — the one-line Apify SDK call + actor ID from config.
+  2. _to_raw_job() — the field-alias map (actor output schema varies).
 """
-# TODO (Stage 1): implement ApifySource per CLAUDE.md "Stage 1 — Ingest".
-# Drop-in alternative to JobSpySource; same JobSource interface.
-# Uses apify-client; actor ID from config (APIFY_ACTOR env var).
-# Map Apify actor output fields → RawJob; same dedup key (LinkedIn jobPostingId).
+from __future__ import annotations
+
+import logging
+
+from ..config import get_settings
+from ..models import RawJob
+from .base import JobSource, canonical_linkedin_url, clean_value, parse_linkedin_job_id
+
+logger = logging.getLogger(__name__)
+
+# Field aliases tried in order when mapping actor output → RawJob.
+# Extend this list when the actor schema is known; first match wins.
+_COMPANY_KEYS = ("company", "companyName", "company_name")
+_URL_KEYS = ("jobUrl", "job_url", "url", "link", "applyUrl")
+_WEBSITE_KEYS = ("companyWebsite", "website", "company_url_direct", "companyUrl")
+_POSTED_KEYS = ("postedAt", "listedAt", "date_posted", "postedDate", "publishedAt")
+_TITLE_KEYS = ("title", "jobTitle", "position")
+_LOCATION_KEYS = ("location", "jobLocation")
+
+
+class ApifySource(JobSource):
+    """Fetches recent LinkedIn jobs via an Apify actor (paid; actor-agnostic)."""
+
+    def fetch_recent_jobs(
+        self,
+        search_terms: list[str],
+        location: str,
+        hours_old: int,
+        results_wanted: int,
+    ) -> list[RawJob]:
+        settings = get_settings()
+        token = settings.apify_token
+        if not token or token.startswith("PLACEHOLDER"):
+            logger.error(
+                "Apify token is not configured (APIFY_TOKEN). "
+                "Set a real token or switch JOB_SOURCE=jobspy."
+            )
+            return []
+
+        run_input = {
+            "queries": search_terms,
+            "location": location,
+            "maxItems": results_wanted,
+        }
+        try:
+            items = self._run_actor(token, settings.apify_actor, run_input)
+        except Exception:
+            logger.exception("Apify actor run failed")
+            return []
+
+        seen: dict[str, RawJob] = {}
+        for item in items:
+            raw = self._to_raw_job(item)
+            if raw is None:
+                continue
+            if raw.job_id not in seen:
+                seen[raw.job_id] = raw
+
+        logger.info("Apify: %d unique jobs returned", len(seen))
+        return list(seen.values())
+
+    # ------------------------------------------------------------------
+    # Isolated Apify boundary — the one-line actor swap point.
+    # ------------------------------------------------------------------
+
+    def _run_actor(self, token: str, actor_id: str, run_input: dict) -> list[dict]:
+        """Call the Apify actor and return all dataset items as plain dicts."""
+        from apify_client import ApifyClient  # type: ignore[import-untyped]
+
+        client = ApifyClient(token)
+        run = client.actor(actor_id).call(run_input=run_input)
+        return list(client.dataset(run["defaultDatasetId"]).iterate_items())
+
+    # ------------------------------------------------------------------
+    # Field mapping
+    # ------------------------------------------------------------------
+
+    def _to_raw_job(self, item: dict) -> RawJob | None:
+        """Map one actor output item to RawJob; return None to skip."""
+        raw_url = _first(item, _URL_KEYS)
+        job_id = parse_linkedin_job_id(raw_url)
+        if not job_id:
+            logger.debug("Skipping Apify item with no LinkedIn job_id: %s", raw_url)
+            return None
+
+        company = _first(item, _COMPANY_KEYS)
+        if not company:
+            logger.debug("Skipping Apify job %s: no company name", job_id)
+            return None
+
+        from datetime import datetime
+
+        posted_raw = _first(item, _POSTED_KEYS, coerce=False)
+        listed_at: datetime | None = None
+        if posted_raw:
+            try:
+                listed_at = datetime.fromisoformat(str(posted_raw))
+            except (ValueError, TypeError):
+                pass
+
+        return RawJob(
+            job_id=job_id,
+            company=company,
+            linkedin_url=canonical_linkedin_url(job_id),
+            website=_first(item, _WEBSITE_KEYS),
+            listed_at=listed_at,
+            title=_first(item, _TITLE_KEYS),
+            location=_first(item, _LOCATION_KEYS),
+        )
+
+
+def _first(item: dict, keys: tuple[str, ...], *, coerce: bool = True) -> str | None:
+    """Return the first non-empty value found under any of keys."""
+    for k in keys:
+        v = item.get(k)
+        if coerce:
+            v = clean_value(v)
+        elif v is None:
+            continue
+        if v:
+            return str(v) if not coerce else v
+    return None
--- a/jobsource/sources/base.py
+++ b/jobsource/sources/base.py
@@ -1,16 +1,62 @@
-"""JobSource interface: every ingestion provider must implement fetch_recent_jobs().
+"""JobSource interface and shared LinkedIn URL helpers.

-Scaffold stub -- not implemented yet.
+Every ingestion provider implements JobSource. The helpers here are
+shared by all providers and have no external dependencies.
 """
-# TODO (Stage 1): define the JobSource ABC per CLAUDE.md "Stage 1 — Ingest (deterministic)".
-# Interface:
-#   class JobSource(ABC):
-#       @abstractmethod
-#       def fetch_recent_jobs(
-#           self,
-#           search_terms: list[str],
-#           location: str,
-#           hours_old: int,
-#           results_wanted: int,
-#       ) -> list[RawJob]: ...
-# Implementations: jobspy_source.JobSpySource, apify_source.ApifySource.
+from __future__ import annotations
+
+import re
+from abc import ABC, abstractmethod
+
+from ..models import RawJob
+
+_LINKEDIN_JOB_URL_RE = re.compile(r"/jobs/view/(\d+)")
+
+
+class JobSource(ABC):
+    """Abstract base for all ingestion providers."""
+
+    @abstractmethod
+    def fetch_recent_jobs(
+        self,
+        search_terms: list[str],
+        location: str,
+        hours_old: int,
+        results_wanted: int,
+    ) -> list[RawJob]:
+        """Return recent jobs matching search_terms in location.
+
+        Never raises — callers expect a list (possibly empty) on any error.
+        """
+
+
+def parse_linkedin_job_id(url: str | None) -> str | None:
+    """Extract the numeric jobPostingId from a LinkedIn job-view URL.
+
+    Returns None for None input or any URL that doesn't contain /jobs/view/{id}.
+    Tracking query params are ignored (we only look at the path segment).
+    """
+    if not url:
+        return None
+    m = _LINKEDIN_JOB_URL_RE.search(url)
+    return m.group(1) if m else None
+
+
+def canonical_linkedin_url(job_id: str) -> str:
+    """Return the clean LinkedIn job URL with no tracking params."""
+    return f"https://www.linkedin.com/jobs/view/{job_id}"
+
+
+def clean_value(value: object) -> str | None:
+    """Normalize a source cell to str | None.
+
+    Treats None, empty/whitespace strings, and float NaN (pandas sentinel)
+    as None — without importing pandas.
+    """
+    if value is None:
+        return None
+    # float NaN check without pandas: NaN is the only float where v != v
+    if isinstance(value, float) and value != value:
+        return None
+    s = str(value).strip()
+    return s if s else None
--- a/jobsource/sources/jobspy_source.py
+++ b/jobsource/sources/jobspy_source.py
@@ -1,10 +1,137 @@
 """JobSpy ingestion provider (default, free) — implements JobSource.

-Scaffold stub -- not implemented yet.
+Uses python-jobspy to search LinkedIn. The boundary between JobSpy's API
+and this module is _scrape(); everything else is plain mapping logic.
+
+JobSpy column names confirmed against live responses (update CLAUDE.md Gotchas
+when first verified): id, job_url, company, company_url_direct, date_posted,
+title, location. company_url_direct is the company's own site (not the LinkedIn
+company page); fill rate observed as low — resolve.py covers the gap.
 """
-# TODO (Stage 1): implement JobSpySource per CLAUDE.md "Stage 1 — Ingest".
-# Uses python-jobspy (python_jobspy). Key notes:
-#   - Search LinkedIn via JobSpy; parse LinkedIn numeric jobPostingId from the job URL.
-#   - Map JobSpy result fields → RawJob (company, website from company_url_direct if present).
-#   - Strip tracking query params from linkedin_url; keep only /jobs/view/{id}.
-#   - Log observed fill rate of company_url_direct (see CLAUDE.md Gotchas).
+from __future__ import annotations
+
+import logging
+from datetime import date, datetime
+
+from ..models import RawJob
+from .base import JobSource, canonical_linkedin_url, clean_value, parse_linkedin_job_id
+
+logger = logging.getLogger(__name__)
+
+
+class JobSpySource(JobSource):
+    """Fetches recent LinkedIn jobs via python-jobspy (no authentication required)."""
+
+    def fetch_recent_jobs(
+        self,
+        search_terms: list[str],
+        location: str,
+        hours_old: int,
+        results_wanted: int,
+    ) -> list[RawJob]:
+        seen: dict[str, RawJob] = {}
+        total_records = 0
+        with_website = 0
+
+        for term in search_terms:
+            try:
+                records = self._scrape(term, location, hours_old, results_wanted)
+            except Exception:
+                logger.exception("JobSpy scrape failed for term %r", term)
+                continue
+
+            for record in records:
+                total_records += 1
+                raw = self._to_raw_job(record)
+                if raw is None:
+                    continue
+                if raw.website:
+                    with_website += 1
+                if raw.job_id not in seen:
+                    seen[raw.job_id] = raw
+
+        fill_rate = (with_website / total_records * 100) if total_records else 0.0
+        logger.info(
+            "JobSpy: %d unique jobs from %d terms; company_url_direct fill rate %.0f%%",
+            len(seen),
+            len(search_terms),
+            fill_rate,
+        )
+        return list(seen.values())
+
+    # ------------------------------------------------------------------
+    # Isolated JobSpy boundary — swap provider here and in the import only.
+    # ------------------------------------------------------------------
+
+    def _scrape(
+        self, term: str, location: str, hours_old: int, results_wanted: int
+    ) -> list[dict]:
+        """Call python-jobspy and return raw records as plain dicts."""
+        from jobspy import scrape_jobs  # type: ignore[import-untyped]
+
+        df = scrape_jobs(
+            site_name=["linkedin"],
+            search_term=term,
+            location=location,
+            results_wanted=results_wanted,
+            hours_old=hours_old,
+            linkedin_fetch_description=False,
+        )
+        if df is None or df.empty:
+            return []
+        return df.to_dict("records")  # type: ignore[return-value]
+
+    # ------------------------------------------------------------------
+    # Field mapping
+    # ------------------------------------------------------------------
+
+    def _to_raw_job(self, record: dict) -> RawJob | None:
+        """Map one JobSpy record dict to RawJob; return None to skip."""
+        raw_url = clean_value(record.get("job_url"))
+        job_id = parse_linkedin_job_id(raw_url)
+        if not job_id:
+            # Fallback: JobSpy sometimes exposes a bare id column
+            job_id = clean_value(record.get("id"))
+        if not job_id:
+            logger.debug("Skipping record with no parseable job_id: %s", raw_url)
+            return None
+
+        company = clean_value(record.get("company"))
+        if not company:
+            logger.debug("Skipping job %s: no company name", job_id)
+            return None
+
+        # company_url_direct is the company's own site; company_url is the LinkedIn page.
+        website = clean_value(record.get("company_url_direct"))
+
+        return RawJob(
+            job_id=job_id,
+            company=company,
+            linkedin_url=canonical_linkedin_url(job_id),
+            website=website,
+            listed_at=_to_datetime(record.get("date_posted")),
+            title=clean_value(record.get("title")),
+            location=clean_value(record.get("location")),
+        )
+
+
+def _to_datetime(value: object) -> datetime | None:
+    """Coerce JobSpy date/datetime/string cells to datetime | None."""
+    if value is None:
+        return None
+    # float NaN (pandas sentinel) — NaN != NaN
+    if isinstance(value, float) and value != value:
+        return None
+    if isinstance(value, datetime):
+        return value
+    if isinstance(value, date):
+        return datetime(value.year, value.month, value.day)
+    # pandas NaT has isoformat but raises when compared; check type name to avoid import
+    if type(value).__name__ == "NaTType":
+        return None
+    if isinstance(value, str):
+        try:
+            return datetime.fromisoformat(value)
+        except ValueError:
+            return None
+    return None