"""Resolve company name → company website URL (Stage 1b, deterministic). Three-tier cascade — returns on first hit: Tier 1: provider-supplied website (trusted, no network call). Tier 2: verified {slug}.com guess (HTTP HEAD/GET probe). Tier 3: search API (gated by SEARCH_API_ENABLED; ships as a stub — wire a real provider in _search_api_lookup() when ready). """ from __future__ import annotations import logging import re import httpx from .config import get_settings from .http import build_client, request_with_retries logger = logging.getLogger(__name__) # Legal suffix words stripped when building the domain slug. _LEGAL_SUFFIXES = re.compile( r"\b(inc|llc|ltd|corp|co|gmbh|plc|sa|ag|pbc|lp|llp)\b", re.IGNORECASE, ) _NON_ALNUM = re.compile(r"[^a-z0-9]+") # HEAD responses that indicate the server doesn't support HEAD — retry with GET. _HEAD_NOT_SUPPORTED = frozenset({405, 501}) def resolve_website( company_name: str, website: str | None = None, *, client: httpx.Client | None = None, ) -> str | None: """Return the company's own website URL, or None if unresolvable. Pass an existing httpx.Client to reuse connections across many calls; otherwise a short-lived client is created and closed here. """ settings = get_settings() _managed = client is None if _managed: client = build_client() try: # Tier 1 — provider-supplied website (trusted, no network needed). if website and not website.startswith("PLACEHOLDER"): resolved = _normalize_scheme(website) logger.info("resolve(%s): tier=provider url=%s", company_name, resolved) return resolved # Tier 2 — {slug}.com guess with HTTP verification. slug = _slug(company_name) if slug: guessed = f"https://{slug}.com" verified = _verify(client, guessed) if verified: logger.info("resolve(%s): tier=slug_guess url=%s", company_name, verified) return verified # Tier 3 — optional search API (gated; stub by default). if settings.search_api_enabled and not settings.search_api_key.startswith("PLACEHOLDER"): result = _search_api_lookup(company_name, client) if result: logger.info("resolve(%s): tier=search_api url=%s", company_name, result) return result logger.info("resolve(%s): unresolvable (all tiers missed)", company_name) return None finally: if _managed: client.close() # --------------------------------------------------------------------------- # Internal helpers # --------------------------------------------------------------------------- def _normalize_scheme(url: str) -> str: """Add https:// if the URL has no scheme.""" if url.startswith(("http://", "https://")): return url return f"https://{url}" def _slug(name: str) -> str | None: """Normalize company name to a domain slug (lowercase, no legal suffixes, alnum only).""" s = name.lower() s = _LEGAL_SUFFIXES.sub("", s) s = _NON_ALNUM.sub("", s) return s or None def _verify(client: httpx.Client, url: str) -> str | None: """Probe url with HEAD (fallback to GET on 405/501); return final URL or None.""" try: resp = request_with_retries(client, "HEAD", url, max_retries=1) if resp.status_code in _HEAD_NOT_SUPPORTED: resp = request_with_retries(client, "GET", url, max_retries=1) if resp.status_code < 400: return str(resp.url) return None except Exception: return None def _search_api_lookup(company_name: str, client: httpx.Client) -> str | None: """Search API fallback — returns the company's website from a web search. Integration point: implement by querying a search API with f'"{company_name}" official website' using settings.search_api_key, extracting the registrable domain from the top organic result's URL, verifying it with _verify(), and returning the URL or None. Currently a stub; enabled only when SEARCH_API_ENABLED=true and a real SEARCH_API_KEY is set. """ return None