phase1-ingest-resolve
This commit is contained in:
@@ -1,10 +1,122 @@
|
||||
"""Resolve company name → company website URL (Stage 1b, deterministic).
|
||||
|
||||
Tiers 1 and 2 are implemented; the Tier 3 search lookup is still a stub.
|
||||
Three-tier cascade — returns on first hit:
|
||||
Tier 1: provider-supplied website (trusted, no network call).
|
||||
Tier 2: verified {slug}.com guess (HTTP HEAD/GET probe).
|
||||
Tier 3: search API (gated by SEARCH_API_ENABLED; ships as a stub — wire
|
||||
a real provider in _search_api_lookup() when ready).
|
||||
"""
|
||||
# Implements CLAUDE.md "Stage 1b — Resolve website (deterministic)"; the Tier 3 search lookup is still a stub.
|
||||
# Resolution order:
|
||||
# 1. Use provider-supplied website if present.
|
||||
# 2. Verified domain guess: normalize company name to {slug}.com and probe via HTTP HEAD.
|
||||
# 3. Optional search API (SEARCH_API_ENABLED=true) as final fallback.
|
||||
# Returns the resolved URL string, or None if unresolvable.
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
import httpx
|
||||
|
||||
from .config import get_settings
|
||||
from .http import build_client, request_with_retries
|
||||
|
||||
logger = logging.getLogger(__name__)

# Corporate-form words (Inc, LLC, GmbH, ...) that are stripped when building
# the domain slug. Matched as whole words, ignoring letter case.
_LEGAL_SUFFIXES = re.compile(
    r"\b(inc|llc|ltd|corp|co|gmbh|plc|sa|ag|pbc|lp|llp)\b",
    flags=re.IGNORECASE,
)

# One or more characters that are not a lowercase ASCII letter or a digit.
_NON_ALNUM = re.compile(r"[^a-z0-9]+")

# Status codes a server returns when it does not support HEAD; the probe
# is retried with GET when one of these comes back.
_HEAD_NOT_SUPPORTED = frozenset((405, 501))
|
||||
|
||||
|
||||
def resolve_website(
    company_name: str,
    website: str | None = None,
    *,
    client: httpx.Client | None = None,
) -> str | None:
    """Return the company's own website URL, or None if unresolvable.

    Tiers are tried in order and the first hit is returned:

    1. ``website`` as supplied by the provider (no network call).
    2. ``https://{slug}.com`` built from ``company_name``, accepted only when
       the HTTP probe verifies it.
    3. The search API, when it is enabled in settings and a non-placeholder
       key is configured.

    Args:
        company_name: Company name used for the slug guess, the search lookup
            and the log messages.
        website: Provider-supplied website, if any. Blank values and values
            starting with ``PLACEHOLDER`` are ignored.
        client: Existing httpx.Client to reuse connections across many calls.
            When omitted, a short-lived client is created for the network
            tiers and closed before returning.

    Returns:
        The resolved URL string, or None when every tier misses.
    """
    # Tier 1 — provider-supplied website (trusted, no network needed).
    # Checked before settings are loaded or a client is built, so this path
    # creates no HTTP client at all.
    provided = website.strip() if website else ""
    if provided and not provided.startswith("PLACEHOLDER"):
        resolved = _normalize_scheme(provided)
        logger.info("resolve(%s): tier=provider url=%s", company_name, resolved)
        return resolved

    settings = get_settings()
    owns_client = client is None
    if owns_client:
        client = build_client()

    try:
        # Tier 2 — {slug}.com guess with HTTP verification.
        slug = _slug(company_name)
        if slug:
            verified = _verify(client, f"https://{slug}.com")
            if verified:
                logger.info("resolve(%s): tier=slug_guess url=%s", company_name, verified)
                return verified

        # Tier 3 — optional search API (gated; stub by default). A missing or
        # empty key is treated like a placeholder: the tier is skipped.
        api_key = settings.search_api_key or ""
        if settings.search_api_enabled and api_key and not api_key.startswith("PLACEHOLDER"):
            result = _search_api_lookup(company_name, client)
            if result:
                logger.info("resolve(%s): tier=search_api url=%s", company_name, result)
                return result

        logger.info("resolve(%s): unresolvable (all tiers missed)", company_name)
        return None
    finally:
        # Only close a client this call created; a caller-supplied one is
        # left open for reuse.
        if owns_client:
            client.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _normalize_scheme(url: str) -> str:
    """Return ``url`` with an explicit scheme, defaulting to https.

    Surrounding whitespace is removed first. A URL that already starts with
    ``http://`` or ``https://`` (in any letter case) is returned as is; a
    protocol-relative ``//host`` URL gets ``https:`` prepended; anything else
    gets ``https://`` prepended.
    """
    url = url.strip()
    # URL schemes are case-insensitive, so compare on a lowercased copy.
    if url.lower().startswith(("http://", "https://")):
        return url
    if url.startswith("//"):
        return f"https:{url}"
    return f"https://{url}"
|
||||
|
||||
|
||||
def _slug(name: str) -> str | None:
    """Normalize a company name to a domain slug, or None if nothing is left.

    Steps: lowercase the name, fold accented letters to their base letter
    (so "Nestlé" yields "nestle" rather than "nestl"), remove legal-form
    words, then remove every character that is not a-z or 0-9.

    Letters that have no Unicode decomposition (for example "ø" or "ß") are
    still dropped by the final step.
    """
    import unicodedata  # local import: only this helper needs it

    # NFKD splits an accented letter into its base letter plus combining
    # marks; discarding the marks keeps the base letter in the slug.
    decomposed = unicodedata.normalize("NFKD", name.lower())
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    without_suffixes = _LEGAL_SUFFIXES.sub("", folded)
    return _NON_ALNUM.sub("", without_suffixes) or None
|
||||
|
||||
|
||||
def _verify(client: httpx.Client, url: str) -> str | None:
    """Probe ``url`` and return the response's URL, or None if the probe fails.

    A HEAD request is sent first; when the server answers 405 or 501 (HEAD not
    supported) the probe is repeated with GET. A final status below 400 counts
    as success and ``str(resp.url)`` is returned.

    This is a best-effort check: any exception raised while probing is logged
    at DEBUG level and reported as a miss (None) instead of propagating.
    """
    try:
        resp = request_with_retries(client, "HEAD", url, max_retries=1)
        if resp.status_code in _HEAD_NOT_SUPPORTED:
            resp = request_with_retries(client, "GET", url, max_retries=1)
        if resp.status_code < 400:
            return str(resp.url)
        return None
    except Exception:
        # Deliberately broad: whatever the probe raises, the guess simply
        # counts as unverified. Keep the traceback available for debugging.
        logger.debug("verify(%s): probe failed", url, exc_info=True)
        return None
|
||||
|
||||
|
||||
def _search_api_lookup(company_name: str, client: httpx.Client) -> str | None:
    """Tier 3 fallback: find the company's website through a web search.

    Not implemented yet -- this stub always returns None.

    Intended implementation (integration point):
      1. Query a search API with f'"{company_name}" official website',
         using settings.search_api_key.
      2. Extract the registrable domain from the top organic result's URL.
      3. Verify it with _verify() and return the URL, or None.

    Enabled only when SEARCH_API_ENABLED=true and a real SEARCH_API_KEY is
    set.
    """
    # Stub: no search provider is wired in yet.
    return None
|
||||
|
||||
Reference in New Issue
Block a user