Build: 0.1.8
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
2026-03-08 01:01:39 +05:30
parent 8701bf92ac
commit 9191de9dff
200 changed files with 130793 additions and 18277 deletions

View File

@@ -86,7 +86,9 @@
<header class="md-header" data-md-component="header">
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="omniread" class="md-header__button md-logo" aria-label="omniread" data-md-component="logo">
@@ -151,12 +153,19 @@
</label>
<nav class="md-search__options" aria-label="Search">
<a href="javascript:void(0)" class="md-search__icon md-icon" title="Share" aria-label="Share" data-clipboard data-clipboard-text="" data-md-component="search-share" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M18 16.08c-.76 0-1.44.3-1.96.77L8.91 12.7c.05-.23.09-.46.09-.7s-.04-.47-.09-.7l7.05-4.11c.54.5 1.25.81 2.04.81a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3c0 .24.04.47.09.7L8.04 9.81C7.5 9.31 6.79 9 6 9a3 3 0 0 0-3 3 3 3 0 0 0 3 3c.79 0 1.5-.31 2.04-.81l7.12 4.15c-.05.21-.08.43-.08.66 0 1.61 1.31 2.91 2.92 2.91s2.92-1.3 2.92-2.91A2.92 2.92 0 0 0 18 16.08"/></svg>
</a>
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
</button>
</nav>
<div class="md-search__suggest" data-md-component="search-suggest"></div>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
@@ -182,96 +191,6 @@
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
<div class="md-grid">
<ul class="md-tabs__list">
<li class="md-tabs__item">
<a href="../.." class="md-tabs__link">
Home
</a>
</li>
<li class="md-tabs__item">
<a href="../../core/" class="md-tabs__link">
Core API
</a>
</li>
<li class="md-tabs__item md-tabs__item--active">
<a href="../" class="md-tabs__link">
HTML Handling
</a>
</li>
<li class="md-tabs__item">
<a href="../../pdf/" class="md-tabs__link">
PDF Handling
</a>
</li>
</ul>
</div>
</nav>
<main class="md-main" data-md-component="main">
@@ -285,10 +204,8 @@
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="omniread" class="md-nav__button md-logo" aria-label="omniread" data-md-component="logo">
@@ -331,13 +248,27 @@
<li class="md-nav__item md-nav__item--nested">
<li class="md-nav__item md-nav__item--section md-nav__item--nested">
@@ -346,8 +277,9 @@
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<div class="md-nav__link md-nav__container">
<a href="../../core/" class="md-nav__link ">
<span class="md-ellipsis">
@@ -356,8 +288,14 @@
</span>
<span class="md-nav__icon md-icon"></span>
</label>
</a>
<label class="md-nav__link " for="__nav_2" id="__nav_2_label" tabindex="">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
@@ -367,27 +305,6 @@
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../core/" class="md-nav__link">
<span class="md-ellipsis">
Core
</span>
</a>
</li>
@@ -474,6 +391,16 @@
@@ -481,7 +408,6 @@
<li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
@@ -491,8 +417,9 @@
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="">
<div class="md-nav__link md-nav__container">
<a href="../" class="md-nav__link ">
<span class="md-ellipsis">
@@ -501,8 +428,14 @@
</span>
<span class="md-nav__icon md-icon"></span>
</label>
</a>
<label class="md-nav__link " for="__nav_3" id="__nav_3_label" tabindex="">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_3">
@@ -512,27 +445,6 @@
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../" class="md-nav__link">
<span class="md-ellipsis">
Html
</span>
</a>
</li>
@@ -624,6 +536,25 @@
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#omniread.html.scraper--summary" class="md-nav__link">
<span class="md-ellipsis">
Summary
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#omniread.html.scraper-classes" class="md-nav__link">
<span class="md-ellipsis">
Classes
</span>
</a>
<nav class="md-nav" aria-label="Classes">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#omniread.html.scraper.HTMLScraper" class="md-nav__link">
<span class="md-ellipsis">
HTMLScraper
@@ -634,6 +565,16 @@
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#omniread.html.scraper.HTMLScraper-functions" class="md-nav__link">
<span class="md-ellipsis">
Functions
</span>
</a>
<nav class="md-nav" aria-label="Functions">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#omniread.html.scraper.HTMLScraper.fetch" class="md-nav__link">
<span class="md-ellipsis">
fetch
@@ -659,6 +600,16 @@
</ul>
</nav>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
</ul>
@@ -685,13 +636,27 @@
<li class="md-nav__item md-nav__item--nested">
<li class="md-nav__item md-nav__item--section md-nav__item--nested">
@@ -700,8 +665,9 @@
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<div class="md-nav__link md-nav__container">
<a href="../../pdf/" class="md-nav__link ">
<span class="md-ellipsis">
@@ -710,8 +676,14 @@
</span>
<span class="md-nav__icon md-icon"></span>
</label>
</a>
<label class="md-nav__link " for="__nav_4" id="__nav_4_label" tabindex="">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
@@ -721,27 +693,6 @@
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../pdf/" class="md-nav__link">
<span class="md-ellipsis">
Pdf
</span>
</a>
</li>
@@ -856,6 +807,25 @@
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#omniread.html.scraper--summary" class="md-nav__link">
<span class="md-ellipsis">
Summary
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#omniread.html.scraper-classes" class="md-nav__link">
<span class="md-ellipsis">
Classes
</span>
</a>
<nav class="md-nav" aria-label="Classes">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#omniread.html.scraper.HTMLScraper" class="md-nav__link">
<span class="md-ellipsis">
HTMLScraper
@@ -866,6 +836,16 @@
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#omniread.html.scraper.HTMLScraper-functions" class="md-nav__link">
<span class="md-ellipsis">
Functions
</span>
</a>
<nav class="md-nav" aria-label="Functions">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#omniread.html.scraper.HTMLScraper.fetch" class="md-nav__link">
<span class="md-ellipsis">
fetch
@@ -891,6 +871,16 @@
</ul>
</nav>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
</ul>
@@ -917,7 +907,7 @@
<h2 id="omniread.html.scraper" class="doc doc-heading">
<span class="doc doc-object-name doc-module-name">omniread.html.scraper</span>
<code class="doc-symbol doc-symbol-heading doc-symbol-module"></code> <span class="doc doc-object-name doc-module-name">omniread.html.scraper</span>
</h2>
@@ -925,6 +915,8 @@
<div class="doc doc-contents first">
<p>HTML scraping implementation for OmniRead.</p>
<hr />
<h4 id="omniread.html.scraper--summary">Summary</h4>
<p>This module provides an HTTP-based scraper for retrieving HTML documents.
It implements the core <code>BaseScraper</code> contract using <code>httpx</code> as the transport
layer.</p>
@@ -946,38 +938,39 @@ layer.</p>
<h3 id="omniread.html.scraper-classes">Classes</h3>
<div class="doc doc-object doc-class">
<h3 id="omniread.html.scraper.HTMLScraper" class="doc doc-heading">
<span class="doc doc-object-name doc-class-name">HTMLScraper</span>
<h4 id="omniread.html.scraper.HTMLScraper" class="doc doc-heading">
<code class="doc-symbol doc-symbol-heading doc-symbol-class"></code> <span class="doc doc-object-name doc-class-name">HTMLScraper</span>
</h3>
<div class="doc-signature highlight"><pre><span></span><code><span class="nf">HTMLScraper</span><span class="p">(</span><span class="o">*</span><span class="p">,</span> <span class="n">client</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">httpx</span><span class="o">.</span><span class="n">Client</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">timeout</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">15.0</span><span class="p">,</span> <span class="n">headers</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Mapping</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">follow_redirects</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span>
</code></pre></div>
</h4>
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">HTMLScraper</span><span class="p">(</span><span class="o">*</span><span class="p">,</span> <span class="n">client</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">httpx</span><span class="o">.</span><span class="n">Client</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">timeout</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">15.0</span><span class="p">,</span> <span class="n">headers</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Mapping</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">follow_redirects</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span>
</span></code></pre></div></td></tr></table></div>
<div class="doc doc-contents ">
<p class="doc doc-class-bases">
Bases: <code><a class="autorefs autorefs-internal" title="omniread.core.scraper.BaseScraper" href="../../core/scraper/#omniread.core.scraper.BaseScraper">BaseScraper</a></code></p>
Bases: <code><a class="autorefs autorefs-internal" title="omniread.core.scraper.BaseScraper" href="../../omniread/core/scraper/#omniread.core.scraper.BaseScraper">BaseScraper</a></code></p>
<p>Base HTML scraper using httpx.</p>
<p>This scraper retrieves HTML documents over HTTP(S) and returns them
as raw content wrapped in a <code>Content</code> object.</p>
<p>Fetches raw bytes and metadata only.
The scraper:
- Uses <code>httpx.Client</code> for HTTP requests
- Enforces an HTML content type
- Preserves HTTP response metadata</p>
<p>The scraper does not:
- Parse HTML
- Perform retries or backoff
- Handle non-HTML responses</p>
<details class="notes" open>
<summary>Notes</summary>
<p><strong>Responsibilities:</strong></p>
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span>
<span class="normal">2</span></pre></div></td><td class="code"><div><pre><span></span><code>- This scraper retrieves HTML documents over HTTP(S) and returns them as raw content wrapped in a `Content` object
- Fetches raw bytes and metadata only. The scraper uses `httpx.Client` for HTTP requests, enforces an HTML content type, and preserves HTTP response metadata
</code></pre></div></td></tr></table></div>
<p><strong>Constraints:</strong></p>
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code>- The scraper does not parse HTML, perform retries or backoff, or handle non-HTML responses
</code></pre></div></td></tr></table></div>
</details>
<p>Initialize the HTML scraper.</p>
@@ -995,12 +988,11 @@ The scraper:
<tr class="doc-section-item">
<td><code>client</code></td>
<td>
<code><span title="typing.Optional">Optional</span>[<span title="httpx.Client">Client</span>]</code>
<code><span title="httpx.Client">Client</span> | None</code>
</td>
<td>
<div class="doc-md-description">
<p>Optional pre-configured <code>httpx.Client</code>. If omitted,
a client is created internally.</p>
<p>Optional pre-configured <code>httpx.Client</code>. If omitted, a client is created internally.</p>
</div>
</td>
<td>
@@ -1063,18 +1055,18 @@ a client is created internally.</p>
<h5 id="omniread.html.scraper.HTMLScraper-functions">Functions</h5>
<div class="doc doc-object doc-function">
<h4 id="omniread.html.scraper.HTMLScraper.fetch" class="doc doc-heading">
<span class="doc doc-object-name doc-function-name">fetch</span>
<h6 id="omniread.html.scraper.HTMLScraper.fetch" class="doc doc-heading">
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">fetch</span>
</h4>
<div class="doc-signature highlight"><pre><span></span><code><span class="nf">fetch</span><span class="p">(</span><span class="n">source</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">metadata</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Mapping</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Content</span>
</code></pre></div>
</h6>
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">fetch</span><span class="p">(</span><span class="n">source</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">metadata</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Mapping</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Content</span>
</span></code></pre></div></td></tr></table></div>
<div class="doc doc-contents ">
@@ -1128,66 +1120,18 @@ a client is created internally.</p>
<table>
<thead>
<tr>
<th>Type</th>
<th>Name</th> <th>Type</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr class="doc-section-item">
<td>
<code><a class="autorefs autorefs-internal" title="omniread.core.content.Content" href="../../core/content/#omniread.core.content.Content">Content</a></code>
<td><code>Content</code></td> <td>
<code><a class="autorefs autorefs-internal" title="omniread.core.content.Content" href="../../omniread/core/content/#omniread.core.content.Content">Content</a></code>
</td>
<td>
<div class="doc-md-description">
<p>A <code>Content</code> instance containing:</p>
</div>
</td>
</tr>
<tr class="doc-section-item">
<td>
<code><a class="autorefs autorefs-internal" title="omniread.core.content.Content" href="../../core/content/#omniread.core.content.Content">Content</a></code>
</td>
<td>
<div class="doc-md-description">
<ul>
<li>Raw HTML bytes</li>
</ul>
</div>
</td>
</tr>
<tr class="doc-section-item">
<td>
<code><a class="autorefs autorefs-internal" title="omniread.core.content.Content" href="../../core/content/#omniread.core.content.Content">Content</a></code>
</td>
<td>
<div class="doc-md-description">
<ul>
<li>Source URL</li>
</ul>
</div>
</td>
</tr>
<tr class="doc-section-item">
<td>
<code><a class="autorefs autorefs-internal" title="omniread.core.content.Content" href="../../core/content/#omniread.core.content.Content">Content</a></code>
</td>
<td>
<div class="doc-md-description">
<ul>
<li>HTML content type</li>
</ul>
</div>
</td>
</tr>
<tr class="doc-section-item">
<td>
<code><a class="autorefs autorefs-internal" title="omniread.core.content.Content" href="../../core/content/#omniread.core.content.Content">Content</a></code>
</td>
<td>
<div class="doc-md-description">
<ul>
<li>HTTP response metadata</li>
</ul>
<p>A <code>Content</code> instance containing raw HTML bytes, source URL, HTML content type, and HTTP response metadata.</p>
</div>
</td>
</tr>
@@ -1234,13 +1178,13 @@ a client is created internally.</p>
<div class="doc doc-object doc-function">
<h4 id="omniread.html.scraper.HTMLScraper.validate_content_type" class="doc doc-heading">
<span class="doc doc-object-name doc-function-name">validate_content_type</span>
<h6 id="omniread.html.scraper.HTMLScraper.validate_content_type" class="doc doc-heading">
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">validate_content_type</span>
</h4>
<div class="doc-signature highlight"><pre><span></span><code><span class="nf">validate_content_type</span><span class="p">(</span><span class="n">response</span><span class="p">:</span> <span class="n">httpx</span><span class="o">.</span><span class="n">Response</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span>
</code></pre></div>
</h6>
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">validate_content_type</span><span class="p">(</span><span class="n">response</span><span class="p">:</span> <span class="n">httpx</span><span class="o">.</span><span class="n">Response</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span>
</span></code></pre></div></td></tr></table></div>
<div class="doc doc-contents ">
@@ -1291,8 +1235,7 @@ a client is created internally.</p>
</td>
<td>
<div class="doc-md-description">
<p>If the <code>Content-Type</code> header is missing or does not
indicate HTML content.</p>
<p>If the <code>Content-Type</code> header is missing or does not indicate HTML content.</p>
</div>
</td>
</tr>
@@ -1336,6 +1279,8 @@ indicate HTML content.</p>
</div>
<script>var tabs=__md_get("__tabs");if(Array.isArray(tabs))e:for(var set of document.querySelectorAll(".tabbed-set")){var labels=set.querySelector(".tabbed-labels");for(var tab of tabs)for(var label of labels.getElementsByTagName("label"))if(label.innerText.trim()===tab){var input=document.getElementById(label.htmlFor);input.checked=!0;continue e}}</script>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
@@ -1373,7 +1318,7 @@ indicate HTML content.</p>
<script id="__config" type="application/json">{"base": "../..", "features": ["navigation.tabs", "navigation.expand", "navigation.top", "navigation.instant", "content.code.copy", "content.code.annotate"], "search": "../../assets/javascripts/workers/search.973d3a69.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script id="__config" type="application/json">{"base": "../..", "features": ["navigation.sections", "navigation.expand", "navigation.top", "navigation.instant", "navigation.tracking", "navigation.indexes", "content.code.copy", "content.code.annotate", "content.tabs.link", "content.action.edit", "search.highlight", "search.share", "search.suggest"], "search": "../../assets/javascripts/workers/search.973d3a69.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.f55a23d4.min.js"></script>