"""Tests for sources/base.py, jobspy_source.py, apify_source.py, and the factory. All tests are network-free. Heavy provider deps (jobspy, apify-client) are never imported; their integration points (_scrape, _run_actor) are monkeypatched. """ from __future__ import annotations from datetime import date, datetime import pytest from jobsource.config import get_settings from jobsource.sources import JobSource, get_job_source from jobsource.sources.apify_source import ApifySource from jobsource.sources.base import ( canonical_linkedin_url, clean_value, parse_linkedin_job_id, ) from jobsource.sources.jobspy_source import JobSpySource, _to_datetime # --------------------------------------------------------------------------- # base helpers # --------------------------------------------------------------------------- class TestParseLinkedinJobId: def test_clean_url(self): assert parse_linkedin_job_id("https://www.linkedin.com/jobs/view/1234567890") == "1234567890" def test_trailing_slash(self): assert parse_linkedin_job_id("https://www.linkedin.com/jobs/view/999/") == "999" def test_tracking_params_ignored(self): url = "https://www.linkedin.com/jobs/view/42?refId=abc&trackingId=xyz" assert parse_linkedin_job_id(url) == "42" def test_none_input(self): assert parse_linkedin_job_id(None) is None def test_non_job_url(self): assert parse_linkedin_job_id("https://www.linkedin.com/company/acme") is None def test_empty_string(self): assert parse_linkedin_job_id("") is None class TestCanonicalLinkedinUrl: def test_formats_correctly(self): assert canonical_linkedin_url("123") == "https://www.linkedin.com/jobs/view/123" class TestCleanValue: def test_none(self): assert clean_value(None) is None def test_empty_string(self): assert clean_value("") is None def test_whitespace(self): assert clean_value(" ") is None def test_nan(self): assert clean_value(float("nan")) is None def test_normal_string(self): assert clean_value(" Acme Corp ") == "Acme Corp" def test_non_string_coerced(self): 
assert clean_value(42) == "42" def test_zero_is_kept(self): assert clean_value(0) == "0" # --------------------------------------------------------------------------- # _to_datetime (module-level helper in jobspy_source) # --------------------------------------------------------------------------- class TestToDatetime: def test_none(self): assert _to_datetime(None) is None def test_nan(self): assert _to_datetime(float("nan")) is None def test_datetime(self): dt = datetime(2024, 1, 15, 12, 0) assert _to_datetime(dt) == dt def test_date(self): result = _to_datetime(date(2024, 1, 15)) assert result == datetime(2024, 1, 15) def test_iso_string(self): assert _to_datetime("2024-01-15") == datetime(2024, 1, 15) def test_bad_string(self): assert _to_datetime("not a date") is None def test_nat_type_name(self): class FakeNaT: __name__ = "NaTType" obj = FakeNaT() type(obj).__name__ = "NaTType" assert _to_datetime(obj) is None # --------------------------------------------------------------------------- # JobSpySource._to_raw_job # --------------------------------------------------------------------------- class TestJobSpyToRawJob: def _src(self): return JobSpySource() def _record(self, **overrides) -> dict: base = { "job_url": "https://www.linkedin.com/jobs/view/100", "company": "Acme Corp", "company_url_direct": "https://acme.com", "date_posted": "2024-06-01", "title": "Engineer", "location": "Remote", "id": None, } base.update(overrides) return base def test_basic_mapping(self): raw = self._src()._to_raw_job(self._record()) assert raw is not None assert raw.job_id == "100" assert raw.company == "Acme Corp" assert raw.website == "https://acme.com" assert raw.linkedin_url == "https://www.linkedin.com/jobs/view/100" assert raw.listed_at == datetime(2024, 6, 1) assert raw.title == "Engineer" assert raw.location == "Remote" def test_website_from_company_url_direct_not_company_url(self): record = self._record(company_url_direct=None, 
company_url="https://linkedin.com/company/acme") raw = self._src()._to_raw_job(record) assert raw is not None assert raw.website is None # company_url (LinkedIn page) must NOT be used def test_nan_website_becomes_none(self): raw = self._src()._to_raw_job(self._record(company_url_direct=float("nan"))) assert raw is not None assert raw.website is None def test_missing_job_id_returns_none(self): record = self._record(job_url="https://example.com/not-a-linkedin-url", id=None) assert self._src()._to_raw_job(record) is None def test_bare_id_fallback(self): record = self._record(job_url=None, id="999") raw = self._src()._to_raw_job(record) assert raw is not None assert raw.job_id == "999" def test_missing_company_returns_none(self): raw = self._src()._to_raw_job(self._record(company=None)) assert raw is None def test_linkedin_url_is_canonical(self): record = self._record(job_url="https://www.linkedin.com/jobs/view/55?tracking=abc") raw = self._src()._to_raw_job(record) assert raw is not None assert raw.linkedin_url == "https://www.linkedin.com/jobs/view/55" assert "tracking" not in raw.linkedin_url # --------------------------------------------------------------------------- # JobSpySource.fetch_recent_jobs (monkeypatched _scrape) # --------------------------------------------------------------------------- class TestJobSpyFetchRecentJobs: def _make_record(self, job_id: str, term_suffix: str = "") -> dict: return { "job_url": f"https://www.linkedin.com/jobs/view/{job_id}", "company": f"Acme{term_suffix}", "company_url_direct": None, "date_posted": None, "title": "Eng", "location": "Remote", "id": None, } def test_dedup_across_terms(self, monkeypatch): src = JobSpySource() calls = [] def fake_scrape(term, location, hours_old, results_wanted): calls.append(term) if term == "engineer": return [self._make_record("1"), self._make_record("2")] # "developer" returns job 2 again + a new job 3 return [self._make_record("2"), self._make_record("3")] monkeypatch.setattr(src, 
"_scrape", fake_scrape) results = src.fetch_recent_jobs(["engineer", "developer"], "US", 72, 10) ids = {r.job_id for r in results} assert ids == {"1", "2", "3"} # deduped; "2" not duplicated assert len(calls) == 2 def test_failing_scrape_returns_empty(self, monkeypatch): src = JobSpySource() monkeypatch.setattr(src, "_scrape", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("boom"))) results = src.fetch_recent_jobs(["engineer"], "US", 72, 10) assert results == [] def test_empty_scrape(self, monkeypatch): src = JobSpySource() monkeypatch.setattr(src, "_scrape", lambda *a, **k: []) results = src.fetch_recent_jobs(["engineer"], "US", 72, 10) assert results == [] # --------------------------------------------------------------------------- # ApifySource._to_raw_job # --------------------------------------------------------------------------- class TestApifyToRawJob: def _src(self): return ApifySource() def test_camel_case_keys(self): item = { "jobUrl": "https://www.linkedin.com/jobs/view/77", "companyName": "BigCo", "companyWebsite": "https://bigco.com", "postedAt": "2024-03-01T00:00:00", "title": "PM", "location": "NYC", } raw = self._src()._to_raw_job(item) assert raw is not None assert raw.job_id == "77" assert raw.company == "BigCo" assert raw.website == "https://bigco.com" assert raw.title == "PM" def test_snake_case_keys(self): item = { "job_url": "https://www.linkedin.com/jobs/view/88", "company": "LilCo", "website": "https://lilco.io", "date_posted": "2024-04-01", "title": "SWE", "location": "SF", } raw = self._src()._to_raw_job(item) assert raw is not None assert raw.job_id == "88" assert raw.website == "https://lilco.io" def test_no_linkedin_url_returns_none(self): item = {"url": "https://example.com/job/99", "company": "X"} assert self._src()._to_raw_job(item) is None def test_no_company_returns_none(self): item = {"jobUrl": "https://www.linkedin.com/jobs/view/55"} assert self._src()._to_raw_job(item) is None # 
# ---------------------------------------------------------------------------
# ApifySource.fetch_recent_jobs (monkeypatched _run_actor)
# ---------------------------------------------------------------------------


class TestApifyFetchRecentJobs:
    """Exercise ApifySource.fetch_recent_jobs with _run_actor replaced by fakes."""

    @pytest.fixture(autouse=True)
    def _clear_settings_cache(self):
        """Clear the get_settings cache after every test in this class.

        Running the clear as fixture teardown (instead of as the last statement
        of a test body) guarantees it happens even when an assertion fails, so
        a patched settings object cannot leak into later tests.
        """
        yield
        get_settings.cache_clear()

    def test_placeholder_token_returns_empty(self):
        # Default settings have placeholder token
        src = ApifySource()
        results = src.fetch_recent_jobs(["engineer"], "US", 72, 10)
        assert results == []

    def test_run_actor_failure_returns_empty(self, monkeypatch):
        src = ApifySource()
        monkeypatch.setattr(get_settings(), "apify_token", "real-token-abc")

        def failing_actor(*args, **kwargs):
            raise RuntimeError("api error")

        monkeypatch.setattr(src, "_run_actor", failing_actor)
        results = src.fetch_recent_jobs(["engineer"], "US", 72, 10)
        assert results == []

    def test_dedup(self, monkeypatch):
        src = ApifySource()
        monkeypatch.setattr(get_settings(), "apify_token", "real-token")
        items = [
            {"jobUrl": f"https://www.linkedin.com/jobs/view/{i}", "company": "Co"}
            for i in [10, 10, 20]
        ]
        monkeypatch.setattr(src, "_run_actor", lambda *a, **k: items)
        results = src.fetch_recent_jobs(["eng"], "US", 72, 10)
        assert {r.job_id for r in results} == {"10", "20"}
        # A set comparison cannot see duplicates, so also pin the count.
        assert len(results) == 2


# ---------------------------------------------------------------------------
# Factory
# ---------------------------------------------------------------------------


class TestGetJobSource:
    """Pin which JobSource implementation get_job_source selects."""

    @pytest.fixture(autouse=True)
    def _fresh_settings_cache(self):
        """Clear the get_settings cache before and after every test in this class.

        The clear before the test means the first get_settings() call in a test
        body is made after any monkeypatch.setenv / delenv in that body. The
        clear after the test runs as teardown, so it is not skipped when an
        assertion fails.
        """
        get_settings.cache_clear()
        yield
        get_settings.cache_clear()

    def test_default_returns_jobspy(self, monkeypatch):
        # Make "default" independent of whatever JOB_SOURCE the caller's shell has.
        # NOTE(review): settings loaded from other places (e.g. a .env file) are
        # not neutralised here.
        monkeypatch.delenv("JOB_SOURCE", raising=False)
        src = get_job_source()
        assert isinstance(src, JobSpySource)

    def test_apify_returns_apify(self, monkeypatch):
        monkeypatch.setenv("JOB_SOURCE", "apify")
        src = get_job_source(get_settings())
        assert isinstance(src, ApifySource)

    def test_unknown_raises(self, monkeypatch):
        monkeypatch.setenv("JOB_SOURCE", "indeed")
        with pytest.raises(ValueError, match="indeed"):
            get_job_source(get_settings())

    def test_returns_job_source_abc(self):
        assert isinstance(get_job_source(), JobSource)