"""JobSource interface and shared LinkedIn URL helpers. Every ingestion provider implements JobSource. The helpers here are shared by all providers and have no external dependencies. """ from __future__ import annotations import re from abc import ABC, abstractmethod from ..models import RawJob _LINKEDIN_JOB_URL_RE = re.compile(r"/jobs/view/(\d+)") class JobSource(ABC): """Abstract base for all ingestion providers.""" @abstractmethod def fetch_recent_jobs( self, search_terms: list[str], location: str, hours_old: int, results_wanted: int, ) -> list[RawJob]: """Return recent jobs matching search_terms in location. Never raises — callers expect a list (possibly empty) on any error. """ def parse_linkedin_job_id(url: str | None) -> str | None: """Extract the numeric jobPostingId from a LinkedIn job-view URL. Returns None for None input or any URL that doesn't contain /jobs/view/{id}. Tracking query params are ignored (we only look at the path segment). """ if not url: return None m = _LINKEDIN_JOB_URL_RE.search(url) return m.group(1) if m else None def canonical_linkedin_url(job_id: str) -> str: """Return the clean LinkedIn job URL with no tracking params.""" return f"https://www.linkedin.com/jobs/view/{job_id}" def clean_value(value: object) -> str | None: """Normalize a source cell to str | None. Treats None, empty/whitespace strings, and float NaN (pandas sentinel) as None — without importing pandas. """ if value is None: return None # float NaN check without pandas: NaN is the only float where v != v if isinstance(value, float) and value != value: return None s = str(value).strip() return s if s else None