omniread/mcp_docs/modules/omniread.html.json

{
  "module": "omniread.html",
  "content": {
    "path": "omniread.html",
    "docstring": "HTML format implementation for OmniRead.\n\nThis package provides **HTML-specific implementations** of the core OmniRead\ncontracts defined in `omniread.core`.\n\nIt includes:\n- HTML parsers that interpret HTML content\n- HTML scrapers that retrieve HTML documents\n\nThis package:\n- Implements, but does not redefine, core contracts\n- May contain HTML-specific behavior and edge-case handling\n- Produces canonical content models defined in `omniread.core.content`\n\nConsumers should depend on `omniread.core` interfaces wherever possible and\nuse this package only when HTML-specific behavior is required.",
    "objects": {
      "HTMLScraper": {
        "name": "HTMLScraper",
        "kind": "class",
        "path": "omniread.html.HTMLScraper",
        "signature": "<bound method Alias.signature of Alias('HTMLScraper', 'omniread.html.scraper.HTMLScraper')>",
        "docstring": "Base HTML scraper using httpx.\n\nThis scraper retrieves HTML documents over HTTP(S) and returns them\nas raw content wrapped in a `Content` object.\n\nFetches raw bytes and metadata only.\nThe scraper:\n- Uses `httpx.Client` for HTTP requests\n- Enforces an HTML content type\n- Preserves HTTP response metadata\n\nThe scraper does not:\n- Parse HTML\n- Perform retries or backoff\n- Handle non-HTML responses",
        "members": {
          "content_type": {
            "name": "content_type",
            "kind": "attribute",
            "path": "omniread.html.HTMLScraper.content_type",
            "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.html.scraper.HTMLScraper.content_type')>",
            "docstring": null
          },
          "validate_content_type": {
            "name": "validate_content_type",
            "kind": "function",
            "path": "omniread.html.HTMLScraper.validate_content_type",
            "signature": "<bound method Alias.signature of Alias('validate_content_type', 'omniread.html.scraper.HTMLScraper.validate_content_type')>",
            "docstring": "Validate that the HTTP response contains HTML content.\n\nArgs:\n    response: HTTP response returned by `httpx`.\n\nRaises:\n    ValueError: If the `Content-Type` header is missing or does not\n        indicate HTML content."
          },
          "fetch": {
            "name": "fetch",
            "kind": "function",
            "path": "omniread.html.HTMLScraper.fetch",
            "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.html.scraper.HTMLScraper.fetch')>",
            "docstring": "Fetch an HTML document from the given source.\n\nArgs:\n    source: URL of the HTML document.\n    metadata: Optional metadata to be merged into the returned content.\n\nReturns:\n    A `Content` instance containing:\n    - Raw HTML bytes\n    - Source URL\n    - HTML content type\n    - HTTP response metadata\n\nRaises:\n    httpx.HTTPError: If the HTTP request fails.\n    ValueError: If the response is not valid HTML."
          }
        }
      },
      "HTMLParser": {
        "name": "HTMLParser",
        "kind": "class",
        "path": "omniread.html.HTMLParser",
        "signature": "<bound method Alias.signature of Alias('HTMLParser', 'omniread.html.parser.HTMLParser')>",
        "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method",
        "members": {
          "supported_types": {
            "name": "supported_types",
            "kind": "attribute",
            "path": "omniread.html.HTMLParser.supported_types",
            "signature": "<bound method Alias.signature of Alias('supported_types', 'omniread.html.parser.HTMLParser.supported_types')>",
            "docstring": "Set of content types supported by this parser (HTML only)."
          },
          "parse": {
            "name": "parse",
            "kind": "function",
            "path": "omniread.html.HTMLParser.parse",
            "signature": "<bound method Alias.signature of Alias('parse', 'omniread.html.parser.HTMLParser.parse')>",
            "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n    Parsed representation of type `T`."
          },
          "parse_div": {
            "name": "parse_div",
            "kind": "function",
            "path": "omniread.html.HTMLParser.parse_div",
            "signature": "<bound method Alias.signature of Alias('parse_div', 'omniread.html.parser.HTMLParser.parse_div')>",
            "docstring": "Extract normalized text from a `<div>` element.\n\nArgs:\n    div: BeautifulSoup tag representing a `<div>`.\n    separator: String used to separate text nodes.\n\nReturns:\n    Flattened, whitespace-normalized text content."
          },
          "parse_link": {
            "name": "parse_link",
            "kind": "function",
            "path": "omniread.html.HTMLParser.parse_link",
            "signature": "<bound method Alias.signature of Alias('parse_link', 'omniread.html.parser.HTMLParser.parse_link')>",
            "docstring": "Extract the hyperlink reference from an `<a>` element.\n\nArgs:\n    a: BeautifulSoup tag representing an anchor.\n\nReturns:\n    The value of the `href` attribute, or None if absent."
          },
          "parse_table": {
            "name": "parse_table",
            "kind": "function",
            "path": "omniread.html.HTMLParser.parse_table",
            "signature": "<bound method Alias.signature of Alias('parse_table', 'omniread.html.parser.HTMLParser.parse_table')>",
            "docstring": "Parse an HTML table into a 2D list of strings.\n\nArgs:\n    table: BeautifulSoup tag representing a `<table>`.\n\nReturns:\n    A list of rows, where each row is a list of cell text values."
          },
          "parse_meta": {
            "name": "parse_meta",
            "kind": "function",
            "path": "omniread.html.HTMLParser.parse_meta",
            "signature": "<bound method Alias.signature of Alias('parse_meta', 'omniread.html.parser.HTMLParser.parse_meta')>",
            "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `<meta>` tag name/property → content mappings\n\nReturns:\n    Dictionary containing extracted metadata."
          }
        }
      },
      "parser": {
        "name": "parser",
        "kind": "module",
        "path": "omniread.html.parser",
        "signature": null,
        "docstring": "HTML parser base implementations for OmniRead.\n\nThis module provides reusable HTML parsing utilities built on top of\nthe abstract parser contracts defined in `omniread.core.parser`.\n\nIt supplies:\n- Content-type enforcement for HTML inputs\n- BeautifulSoup initialization and lifecycle management\n- Common helper methods for extracting structured data from HTML elements\n\nConcrete parsers must subclass `HTMLParser` and implement the `parse()` method\nto return a structured representation appropriate for their use case.",
        "members": {
          "Any": {
            "name": "Any",
            "kind": "alias",
            "path": "omniread.html.parser.Any",
            "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
            "docstring": null
          },
          "Generic": {
            "name": "Generic",
            "kind": "alias",
            "path": "omniread.html.parser.Generic",
            "signature": "<bound method Alias.signature of Alias('Generic', 'typing.Generic')>",
            "docstring": null
          },
          "TypeVar": {
            "name": "TypeVar",
            "kind": "alias",
            "path": "omniread.html.parser.TypeVar",
            "signature": "<bound method Alias.signature of Alias('TypeVar', 'typing.TypeVar')>",
            "docstring": null
          },
          "Optional": {
            "name": "Optional",
            "kind": "alias",
            "path": "omniread.html.parser.Optional",
            "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
            "docstring": null
          },
          "abstractmethod": {
            "name": "abstractmethod",
            "kind": "alias",
            "path": "omniread.html.parser.abstractmethod",
            "signature": "<bound method Alias.signature of Alias('abstractmethod', 'abc.abstractmethod')>",
            "docstring": null
          },
          "BeautifulSoup": {
            "name": "BeautifulSoup",
            "kind": "alias",
            "path": "omniread.html.parser.BeautifulSoup",
            "signature": "<bound method Alias.signature of Alias('BeautifulSoup', 'bs4.BeautifulSoup')>",
            "docstring": null
          },
          "Tag": {
            "name": "Tag",
            "kind": "alias",
            "path": "omniread.html.parser.Tag",
            "signature": "<bound method Alias.signature of Alias('Tag', 'bs4.Tag')>",
            "docstring": null
          },
          "ContentType": {
            "name": "ContentType",
            "kind": "class",
            "path": "omniread.html.parser.ContentType",
            "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
            "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
            "members": {
              "HTML": {
                "name": "HTML",
                "kind": "attribute",
                "path": "omniread.html.parser.ContentType.HTML",
                "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
                "docstring": "HTML document content."
              },
              "PDF": {
                "name": "PDF",
                "kind": "attribute",
                "path": "omniread.html.parser.ContentType.PDF",
                "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
                "docstring": "PDF document content."
              },
              "JSON": {
                "name": "JSON",
                "kind": "attribute",
                "path": "omniread.html.parser.ContentType.JSON",
                "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
                "docstring": "JSON document content."
              },
              "XML": {
                "name": "XML",
                "kind": "attribute",
                "path": "omniread.html.parser.ContentType.XML",
                "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
                "docstring": "XML document content."
              }
            }
          },
          "Content": {
            "name": "Content",
            "kind": "class",
            "path": "omniread.html.parser.Content",
            "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
            "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
            "members": {
              "raw": {
                "name": "raw",
                "kind": "attribute",
                "path": "omniread.html.parser.Content.raw",
                "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
                "docstring": null
              },
              "source": {
                "name": "source",
                "kind": "attribute",
                "path": "omniread.html.parser.Content.source",
                "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
                "docstring": null
              },
              "content_type": {
                "name": "content_type",
                "kind": "attribute",
                "path": "omniread.html.parser.Content.content_type",
                "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
                "docstring": null
              },
              "metadata": {
                "name": "metadata",
                "kind": "attribute",
                "path": "omniread.html.parser.Content.metadata",
                "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
                "docstring": null
              }
            }
          },
          "BaseParser": {
            "name": "BaseParser",
            "kind": "class",
            "path": "omniread.html.parser.BaseParser",
            "signature": "<bound method Alias.signature of Alias('BaseParser', 'omniread.core.parser.BaseParser')>",
            "docstring": "Base interface for all parsers.\n\nA parser is a self-contained object that owns the Content\nit is responsible for interpreting.\n\nImplementations must:\n- Declare supported content types via `supported_types`\n- Raise parsing-specific exceptions from `parse()`\n- Remain deterministic for a given input\n\nConsumers may rely on:\n- Early validation of content compatibility\n- Type-stable return values from `parse()`",
            "members": {
              "supported_types": {
                "name": "supported_types",
                "kind": "attribute",
                "path": "omniread.html.parser.BaseParser.supported_types",
                "signature": "<bound method Alias.signature of Alias('supported_types', 'omniread.core.parser.BaseParser.supported_types')>",
                "docstring": "Set of content types supported by this parser.\n\nAn empty set indicates that the parser is content-type agnostic."
              },
              "content": {
                "name": "content",
                "kind": "attribute",
                "path": "omniread.html.parser.BaseParser.content",
                "signature": "<bound method Alias.signature of Alias('content', 'omniread.core.parser.BaseParser.content')>",
                "docstring": null
              },
              "parse": {
                "name": "parse",
                "kind": "function",
                "path": "omniread.html.parser.BaseParser.parse",
                "signature": "<bound method Alias.signature of Alias('parse', 'omniread.core.parser.BaseParser.parse')>",
                "docstring": "Parse the owned content into structured output.\n\nImplementations must fully consume the provided content and\nreturn a deterministic, structured output.\n\nReturns:\n    Parsed, structured representation.\n\nRaises:\n    Exception: Parsing-specific errors as defined by the implementation."
              },
              "supports": {
                "name": "supports",
                "kind": "function",
                "path": "omniread.html.parser.BaseParser.supports",
                "signature": "<bound method Alias.signature of Alias('supports', 'omniread.core.parser.BaseParser.supports')>",
                "docstring": "Check whether this parser supports the content's type.\n\nReturns:\n    True if the content type is supported; False otherwise."
              }
            }
          },
          "T": {
            "name": "T",
            "kind": "attribute",
            "path": "omniread.html.parser.T",
            "signature": null,
            "docstring": null
          },
          "HTMLParser": {
            "name": "HTMLParser",
            "kind": "class",
            "path": "omniread.html.parser.HTMLParser",
            "signature": "<bound method Class.signature of Class('HTMLParser', 27, 177)>",
            "docstring": "Base HTML parser.\n\nThis class extends the core `BaseParser` with HTML-specific behavior,\nincluding DOM parsing via BeautifulSoup and reusable extraction helpers.\n\nProvides reusable helpers for HTML extraction.\nConcrete parsers must explicitly define the return type.\n\nCharacteristics:\n- Accepts only HTML content\n- Owns a parsed BeautifulSoup DOM tree\n- Provides pure helper utilities for common HTML structures\n\nConcrete subclasses must:\n- Define the output type `T`\n- Implement the `parse()` method",
            "members": {
              "supported_types": {
                "name": "supported_types",
                "kind": "attribute",
                "path": "omniread.html.parser.HTMLParser.supported_types",
                "signature": null,
                "docstring": "Set of content types supported by this parser (HTML only)."
              },
              "parse": {
                "name": "parse",
                "kind": "function",
                "path": "omniread.html.parser.HTMLParser.parse",
                "signature": "<bound method Function.signature of Function('parse', 70, 81)>",
                "docstring": "Fully parse the HTML content into structured output.\n\nImplementations must fully interpret the HTML DOM and return\na deterministic, structured output.\n\nReturns:\n    Parsed representation of type `T`."
              },
              "parse_div": {
                "name": "parse_div",
                "kind": "function",
                "path": "omniread.html.parser.HTMLParser.parse_div",
                "signature": "<bound method Function.signature of Function('parse_div', 87, 99)>",
                "docstring": "Extract normalized text from a `<div>` element.\n\nArgs:\n    div: BeautifulSoup tag representing a `<div>`.\n    separator: String used to separate text nodes.\n\nReturns:\n    Flattened, whitespace-normalized text content."
              },
              "parse_link": {
                "name": "parse_link",
                "kind": "function",
                "path": "omniread.html.parser.HTMLParser.parse_link",
                "signature": "<bound method Function.signature of Function('parse_link', 101, 112)>",
                "docstring": "Extract the hyperlink reference from an `<a>` element.\n\nArgs:\n    a: BeautifulSoup tag representing an anchor.\n\nReturns:\n    The value of the `href` attribute, or None if absent."
              },
              "parse_table": {
                "name": "parse_table",
                "kind": "function",
                "path": "omniread.html.parser.HTMLParser.parse_table",
                "signature": "<bound method Function.signature of Function('parse_table', 114, 133)>",
                "docstring": "Parse an HTML table into a 2D list of strings.\n\nArgs:\n    table: BeautifulSoup tag representing a `<table>`.\n\nReturns:\n    A list of rows, where each row is a list of cell text values."
              },
              "parse_meta": {
                "name": "parse_meta",
                "kind": "function",
                "path": "omniread.html.parser.HTMLParser.parse_meta",
                "signature": "<bound method Function.signature of Function('parse_meta', 153, 177)>",
                "docstring": "Extract high-level metadata from the HTML document.\n\nThis includes:\n- Document title\n- `<meta>` tag name/property → content mappings\n\nReturns:\n    Dictionary containing extracted metadata."
              }
            }
          },
          "list": {
            "name": "list",
            "kind": "alias",
            "path": "omniread.html.parser.list",
            "signature": "<bound method Alias.signature of Alias('list', 'typing.list')>",
            "docstring": null
          },
          "dict": {
            "name": "dict",
            "kind": "alias",
            "path": "omniread.html.parser.dict",
            "signature": "<bound method Alias.signature of Alias('dict', 'typing.dict')>",
            "docstring": null
          }
        }
      },
      "scraper": {
        "name": "scraper",
        "kind": "module",
        "path": "omniread.html.scraper",
        "signature": null,
        "docstring": "HTML scraping implementation for OmniRead.\n\nThis module provides an HTTP-based scraper for retrieving HTML documents.\nIt implements the core `BaseScraper` contract using `httpx` as the transport\nlayer.\n\nThis scraper is responsible for:\n- Fetching raw HTML bytes over HTTP(S)\n- Validating response content type\n- Attaching HTTP metadata to the returned content\n\nThis scraper is not responsible for:\n- Parsing or interpreting HTML\n- Retrying failed requests\n- Managing crawl policies or rate limiting",
        "members": {
          "httpx": {
            "name": "httpx",
            "kind": "alias",
            "path": "omniread.html.scraper.httpx",
            "signature": "<bound method Alias.signature of Alias('httpx', 'httpx')>",
            "docstring": null
          },
          "Any": {
            "name": "Any",
            "kind": "alias",
            "path": "omniread.html.scraper.Any",
            "signature": "<bound method Alias.signature of Alias('Any', 'typing.Any')>",
            "docstring": null
          },
          "Mapping": {
            "name": "Mapping",
            "kind": "alias",
            "path": "omniread.html.scraper.Mapping",
            "signature": "<bound method Alias.signature of Alias('Mapping', 'typing.Mapping')>",
            "docstring": null
          },
          "Optional": {
            "name": "Optional",
            "kind": "alias",
            "path": "omniread.html.scraper.Optional",
            "signature": "<bound method Alias.signature of Alias('Optional', 'typing.Optional')>",
            "docstring": null
          },
          "Content": {
            "name": "Content",
            "kind": "class",
            "path": "omniread.html.scraper.Content",
            "signature": "<bound method Alias.signature of Alias('Content', 'omniread.core.content.Content')>",
            "docstring": "Normalized representation of extracted content.\n\nA `Content` instance represents a raw content payload along with minimal\ncontextual metadata describing its origin and type.\n\nThis class is the **primary exchange format** between:\n- Scrapers\n- Parsers\n- Downstream consumers\n\nAttributes:\n    raw: Raw content bytes as retrieved from the source.\n    source: Identifier of the content origin (URL, file path, or logical name).\n    content_type: Optional MIME type of the content, if known.\n    metadata: Optional, implementation-defined metadata associated with\n        the content (e.g., headers, encoding hints, extraction notes).",
            "members": {
              "raw": {
                "name": "raw",
                "kind": "attribute",
                "path": "omniread.html.scraper.Content.raw",
                "signature": "<bound method Alias.signature of Alias('raw', 'omniread.core.content.Content.raw')>",
                "docstring": null
              },
              "source": {
                "name": "source",
                "kind": "attribute",
                "path": "omniread.html.scraper.Content.source",
                "signature": "<bound method Alias.signature of Alias('source', 'omniread.core.content.Content.source')>",
                "docstring": null
              },
              "content_type": {
                "name": "content_type",
                "kind": "attribute",
                "path": "omniread.html.scraper.Content.content_type",
                "signature": "<bound method Alias.signature of Alias('content_type', 'omniread.core.content.Content.content_type')>",
                "docstring": null
              },
              "metadata": {
                "name": "metadata",
                "kind": "attribute",
                "path": "omniread.html.scraper.Content.metadata",
                "signature": "<bound method Alias.signature of Alias('metadata', 'omniread.core.content.Content.metadata')>",
                "docstring": null
              }
            }
          },
          "ContentType": {
            "name": "ContentType",
            "kind": "class",
            "path": "omniread.html.scraper.ContentType",
            "signature": "<bound method Alias.signature of Alias('ContentType', 'omniread.core.content.ContentType')>",
            "docstring": "Supported MIME types for extracted content.\n\nThis enum represents the declared or inferred media type of the content\nsource. It is primarily used for routing content to the appropriate\nparser or downstream consumer.",
            "members": {
              "HTML": {
                "name": "HTML",
                "kind": "attribute",
                "path": "omniread.html.scraper.ContentType.HTML",
                "signature": "<bound method Alias.signature of Alias('HTML', 'omniread.core.content.ContentType.HTML')>",
                "docstring": "HTML document content."
              },
              "PDF": {
                "name": "PDF",
                "kind": "attribute",
                "path": "omniread.html.scraper.ContentType.PDF",
                "signature": "<bound method Alias.signature of Alias('PDF', 'omniread.core.content.ContentType.PDF')>",
                "docstring": "PDF document content."
              },
              "JSON": {
                "name": "JSON",
                "kind": "attribute",
                "path": "omniread.html.scraper.ContentType.JSON",
                "signature": "<bound method Alias.signature of Alias('JSON', 'omniread.core.content.ContentType.JSON')>",
                "docstring": "JSON document content."
              },
              "XML": {
                "name": "XML",
                "kind": "attribute",
                "path": "omniread.html.scraper.ContentType.XML",
                "signature": "<bound method Alias.signature of Alias('XML', 'omniread.core.content.ContentType.XML')>",
                "docstring": "XML document content."
              }
            }
          },
          "BaseScraper": {
            "name": "BaseScraper",
            "kind": "class",
            "path": "omniread.html.scraper.BaseScraper",
            "signature": "<bound method Alias.signature of Alias('BaseScraper', 'omniread.core.scraper.BaseScraper')>",
            "docstring": "Base interface for all scrapers.\n\nA scraper is responsible ONLY for fetching raw content\n(bytes) from a source. It must not interpret or parse it.\n\nA scraper is a **stateless acquisition component** that retrieves raw\ncontent from a source and returns it as a `Content` object.\n\nScrapers define *how content is obtained*, not *what the content means*.\n\nImplementations may vary in:\n- Transport mechanism (HTTP, filesystem, cloud storage)\n- Authentication strategy\n- Retry and backoff behavior\n\nImplementations must not:\n- Parse content\n- Modify content semantics\n- Couple scraping logic to a specific parser",
            "members": {
              "fetch": {
                "name": "fetch",
                "kind": "function",
                "path": "omniread.html.scraper.BaseScraper.fetch",
                "signature": "<bound method Alias.signature of Alias('fetch', 'omniread.core.scraper.BaseScraper.fetch')>",
                "docstring": "Fetch raw content from the given source.\n\nImplementations must retrieve the content referenced by `source`\nand return it as raw bytes wrapped in a `Content` object.\n\nArgs:\n    source: Location identifier (URL, file path, S3 URI, etc.)\n    metadata: Optional hints for the scraper (headers, auth, etc.)\n\nReturns:\n    A `Content` object containing:\n    - Raw content bytes\n    - Source identifier\n    - Optional metadata\n\nRaises:\n    Exception: Retrieval-specific errors as defined by the implementation."
              }
            }
          },
          "HTMLScraper": {
            "name": "HTMLScraper",
            "kind": "class",
            "path": "omniread.html.scraper.HTMLScraper",
            "signature": "<bound method Class.signature of Class('HTMLScraper', 26, 134)>",
            "docstring": "Base HTML scraper using httpx.\n\nThis scraper retrieves HTML documents over HTTP(S) and returns them\nas raw content wrapped in a `Content` object.\n\nFetches raw bytes and metadata only.\nThe scraper:\n- Uses `httpx.Client` for HTTP requests\n- Enforces an HTML content type\n- Preserves HTTP response metadata\n\nThe scraper does not:\n- Parse HTML\n- Perform retries or backoff\n- Handle non-HTML responses",
            "members": {
              "content_type": {
                "name": "content_type",
                "kind": "attribute",
                "path": "omniread.html.scraper.HTMLScraper.content_type",
                "signature": null,
                "docstring": null
              },
              "validate_content_type": {
                "name": "validate_content_type",
                "kind": "function",
                "path": "omniread.html.scraper.HTMLScraper.validate_content_type",
                "signature": "<bound method Function.signature of Function('validate_content_type', 71, 94)>",
                "docstring": "Validate that the HTTP response contains HTML content.\n\nArgs:\n    response: HTTP response returned by `httpx`.\n\nRaises:\n    ValueError: If the `Content-Type` header is missing or does not\n        indicate HTML content."
              },
              "fetch": {
                "name": "fetch",
                "kind": "function",
                "path": "omniread.html.scraper.HTMLScraper.fetch",
                "signature": "<bound method Function.signature of Function('fetch', 96, 134)>",
                "docstring": "Fetch an HTML document from the given source.\n\nArgs:\n    source: URL of the HTML document.\n    metadata: Optional metadata to be merged into the returned content.\n\nReturns:\n    A `Content` instance containing:\n    - Raw HTML bytes\n    - Source URL\n    - HTML content type\n    - HTTP response metadata\n\nRaises:\n    httpx.HTTPError: If the HTTP request fails.\n    ValueError: If the response is not valid HTML."
              }
            }
          }
        }
      }
    }
  }
}