All checks were successful
continuous-integration/drone/push Build is passing
3659 lines
107 KiB
HTML
3659 lines
107 KiB
HTML
|
|
<!doctype html>
|
|
<html lang="en" class="no-js">
|
|
<head>
|
|
|
|
<meta charset="utf-8">
|
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="next" href="core/">
|
|
|
|
|
|
<link rel="icon" href="assets/images/favicon.png">
|
|
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.6.23">
|
|
|
|
|
|
|
|
<title>omniread</title>
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="assets/stylesheets/main.84d31ad4.min.css">
|
|
|
|
|
|
<link rel="stylesheet" href="assets/stylesheets/palette.06af60db.min.css">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Inter:300,300i,400,400i,700,700i%7CJetBrains+Mono:400,400i,700,700i&display=fallback">
|
|
<style>:root{--md-text-font:"Inter";--md-code-font:"JetBrains Mono"}</style>
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="assets/_mkdocstrings.css">
|
|
|
|
<script>
  /*
   * Bootstrap helpers shared by the later inline scripts on this page.
   * They are assigned without a declaration on purpose, so that they end up
   * as globals that other scripts can call.
   */

  /* URL of the directory that contains the current page. */
  __md_scope = new URL(".", location);

  /* Rolling hash over the characters of `text`: hash = (hash << 5) - hash + code. */
  __md_hash = (text) => {
    let hash = 0;
    for (const ch of [...text]) {
      hash = (hash << 5) - hash + ch.charCodeAt(0);
    }
    return hash;
  };

  /*
   * Read and JSON-decode the value stored under "<scope.pathname>.<key>".
   * `storage` defaults to localStorage and `scope` to __md_scope.
   */
  __md_get = (key, storage = localStorage, scope = __md_scope) => {
    const raw = storage.getItem(scope.pathname + "." + key);
    return JSON.parse(raw);
  };

  /*
   * JSON-encode `value` and store it under "<scope.pathname>.<key>".
   * Any exception raised while encoding or writing is deliberately ignored.
   */
  __md_set = (key, value, storage = localStorage, scope = __md_scope) => {
    try {
      storage.setItem(scope.pathname + "." + key, JSON.stringify(value));
    } catch (err) {}
  };
</script>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</head>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<body dir="ltr" data-md-color-scheme="slate" data-md-color-primary="deep-purple" data-md-color-accent="cyan">
|
|
|
|
|
|
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
|
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
|
<label class="md-overlay" for="__drawer"></label>
|
|
<div data-md-component="skip">
|
|
|
|
|
|
<a href="#omniread_1" class="md-skip">
|
|
Skip to content
|
|
</a>
|
|
|
|
</div>
|
|
<div data-md-component="announce">
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<header class="md-header md-header--shadow" data-md-component="header">
|
|
<nav class="md-header__inner md-grid" aria-label="Header">
|
|
<a href="." title="omniread" class="md-header__button md-logo" aria-label="omniread" data-md-component="logo">
|
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
|
|
|
</a>
|
|
<label class="md-header__button md-icon" for="__drawer">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
|
</label>
|
|
<div class="md-header__title" data-md-component="header-title">
|
|
<div class="md-header__ellipsis">
|
|
<div class="md-header__topic">
|
|
<span class="md-ellipsis">
|
|
omniread
|
|
</span>
|
|
</div>
|
|
<div class="md-header__topic" data-md-component="header-topic">
|
|
<span class="md-ellipsis">
|
|
|
|
Home
|
|
|
|
</span>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<form class="md-header__option" data-md-component="palette">
|
|
|
|
|
|
|
|
|
|
<input class="md-option" data-md-color-media="" data-md-color-scheme="slate" data-md-color-primary="deep-purple" data-md-color-accent="cyan" aria-hidden="true" type="radio" name="__palette" id="__palette_0">
|
|
|
|
|
|
</form>
|
|
|
|
|
|
|
|
<script>
  /*
   * Re-apply the colour palette stored under "__palette" to <body> before
   * the page is painted. Top-level `var` names are kept as-is because they
   * become globals in a classic script.
   */
  var palette = __md_get("__palette");

  if (palette && palette.color) {
    /*
     * A stored media value of "(prefers-color-scheme)" means "follow the
     * system": resolve it to whichever palette input matches the current
     * light/dark preference and copy that input's colour attributes.
     */
    if (palette.color.media === "(prefers-color-scheme)") {
      var media = matchMedia("(prefers-color-scheme: light)");

      let selector;
      if (media.matches) {
        selector = "[data-md-color-media='(prefers-color-scheme: light)']";
      } else {
        selector = "[data-md-color-media='(prefers-color-scheme: dark)']";
      }

      var input = document.querySelector(selector);
      palette.color.media = input.getAttribute("data-md-color-media");
      palette.color.scheme = input.getAttribute("data-md-color-scheme");
      palette.color.primary = input.getAttribute("data-md-color-primary");
      palette.color.accent = input.getAttribute("data-md-color-accent");
    }

    /* Mirror every stored colour setting onto <body> as data-md-color-*. */
    for (var [key, value] of Object.entries(palette.color)) {
      document.body.setAttribute("data-md-color-" + key, value);
    }
  }
</script>
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-header__button md-icon" for="__search">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
|
|
</label>
|
|
<div class="md-search" data-md-component="search" role="dialog">
|
|
<label class="md-search__overlay" for="__search"></label>
|
|
<div class="md-search__inner" role="search">
|
|
<form class="md-search__form" name="search">
|
|
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
|
|
<label class="md-search__icon md-icon" for="__search">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
|
|
</label>
|
|
<nav class="md-search__options" aria-label="Search">
|
|
|
|
<a href="javascript:void(0)" class="md-search__icon md-icon" title="Share" aria-label="Share" data-clipboard data-clipboard-text="" data-md-component="search-share" tabindex="-1">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M18 16.08c-.76 0-1.44.3-1.96.77L8.91 12.7c.05-.23.09-.46.09-.7s-.04-.47-.09-.7l7.05-4.11c.54.5 1.25.81 2.04.81a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3c0 .24.04.47.09.7L8.04 9.81C7.5 9.31 6.79 9 6 9a3 3 0 0 0-3 3 3 3 0 0 0 3 3c.79 0 1.5-.31 2.04-.81l7.12 4.15c-.05.21-.08.43-.08.66 0 1.61 1.31 2.91 2.92 2.91s2.92-1.3 2.92-2.91A2.92 2.92 0 0 0 18 16.08"/></svg>
|
|
</a>
|
|
|
|
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
|
|
</button>
|
|
</nav>
|
|
|
|
<div class="md-search__suggest" data-md-component="search-suggest"></div>
|
|
|
|
</form>
|
|
<div class="md-search__output">
|
|
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
|
|
<div class="md-search-result" data-md-component="search-result">
|
|
<div class="md-search-result__meta">
|
|
Initializing search
|
|
</div>
|
|
<ol class="md-search-result__list" role="presentation"></ol>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
</nav>
|
|
|
|
</header>
|
|
|
|
<div class="md-container" data-md-component="container">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<main class="md-main" data-md-component="main">
|
|
<div class="md-main__inner md-grid">
|
|
|
|
|
|
|
|
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
|
<div class="md-sidebar__scrollwrap">
|
|
<div class="md-sidebar__inner">
|
|
|
|
|
|
|
|
|
|
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
|
<label class="md-nav__title" for="__drawer">
|
|
<a href="." title="omniread" class="md-nav__button md-logo" aria-label="omniread" data-md-component="logo">
|
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
|
|
|
</a>
|
|
omniread
|
|
</label>
|
|
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--active">
|
|
|
|
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Home
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<a href="." class="md-nav__link md-nav__link--active">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Home
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__title" for="__toc">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Table of contents
|
|
</label>
|
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
omniread
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="omniread">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread--summary" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Summary
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread--installation" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Installation
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread--quick-start" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Quick start
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread--public-api" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Public API
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread-classes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Classes
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Classes">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.Content" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Content
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Content">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.Content-attributes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Attributes
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Attributes">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.Content.content_type" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
content_type
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.Content.metadata" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
metadata
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.Content.raw" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
raw
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.Content.source" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
source
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.ContentType" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
ContentType
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="ContentType">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.ContentType-attributes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Attributes
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Attributes">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.ContentType.HTML" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
HTML
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.ContentType.JSON" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
JSON
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.ContentType.PDF" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
PDF
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.ContentType.XML" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
XML
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.FileSystemPDFClient" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
FileSystemPDFClient
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="FileSystemPDFClient">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.FileSystemPDFClient-functions" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Functions
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Functions">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.FileSystemPDFClient.fetch" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
fetch
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
HTMLParser
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="HTMLParser">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser-attributes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Attributes
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Attributes">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.supported_types" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
supported_types
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser-functions" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Functions
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Functions">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.parse" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
parse
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.parse_div" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
parse_div
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.parse_link" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
parse_link
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.parse_meta" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
parse_meta
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.parse_table" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
parse_table
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.supports" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
supports
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLScraper" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
HTMLScraper
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="HTMLScraper">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLScraper-functions" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Functions
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Functions">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLScraper.fetch" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
fetch
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLScraper.validate_content_type" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
validate_content_type
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFParser" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
PDFParser
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="PDFParser">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFParser-attributes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Attributes
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Attributes">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFParser.supported_types" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
supported_types
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFParser-functions" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Functions
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Functions">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFParser.parse" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
parse
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFParser.supports" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
supports
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFScraper" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
PDFScraper
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="PDFScraper">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFScraper-functions" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Functions
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Functions">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFScraper.fetch" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
fetch
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--section md-nav__item--nested">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_2" >
|
|
|
|
|
|
<div class="md-nav__link md-nav__container">
|
|
<a href="core/" class="md-nav__link ">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Core API
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
|
|
|
|
<!-- Expand/collapse control for the "Core API" section; toggles the #__nav_2 checkbox. The empty tabindex attribute was dropped: it is not a valid integer and is ignored by browsers. -->
<label class="md-nav__link" for="__nav_2" id="__nav_2_label">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
</div>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_2">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Core API
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="core/content/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Content
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="core/parser/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Parser
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="core/scraper/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Scraper
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--section md-nav__item--nested">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_3" >
|
|
|
|
|
|
<div class="md-nav__link md-nav__container">
|
|
<a href="html/" class="md-nav__link ">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
HTML Handling
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
|
|
|
|
<!-- Expand/collapse control for the "HTML Handling" section; toggles the #__nav_3 checkbox. The empty tabindex attribute was dropped: it is not a valid integer and is ignored by browsers. -->
<label class="md-nav__link" for="__nav_3" id="__nav_3_label">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
</div>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_3">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
HTML Handling
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="html/parser/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Parser
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="html/scraper/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Scraper
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--section md-nav__item--nested">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_4" >
|
|
|
|
|
|
<div class="md-nav__link md-nav__container">
|
|
<a href="pdf/" class="md-nav__link ">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
PDF Handling
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
|
|
|
|
<!-- Expand/collapse control for the "PDF Handling" section; toggles the #__nav_4 checkbox. The empty tabindex attribute was dropped: it is not a valid integer and is ignored by browsers. -->
<label class="md-nav__link" for="__nav_4" id="__nav_4_label">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
</div>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_4">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
PDF Handling
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="pdf/client/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Client
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="pdf/parser/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Parser
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="pdf/scraper/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Scraper
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
|
<div class="md-sidebar__scrollwrap">
|
|
<div class="md-sidebar__inner">
|
|
|
|
|
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__title" for="__toc">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Table of contents
|
|
</label>
|
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
omniread
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="omniread">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread--summary" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Summary
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread--installation" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Installation
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread--quick-start" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Quick start
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread--public-api" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Public API
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread-classes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Classes
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Classes">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.Content" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Content
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Content">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.Content-attributes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Attributes
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Attributes">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.Content.content_type" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
content_type
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.Content.metadata" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
metadata
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.Content.raw" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
raw
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.Content.source" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
source
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.ContentType" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
ContentType
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="ContentType">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.ContentType-attributes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Attributes
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Attributes">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.ContentType.HTML" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
HTML
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.ContentType.JSON" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
JSON
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.ContentType.PDF" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
PDF
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.ContentType.XML" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
XML
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.FileSystemPDFClient" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
FileSystemPDFClient
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="FileSystemPDFClient">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.FileSystemPDFClient-functions" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Functions
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Functions">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.FileSystemPDFClient.fetch" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
fetch
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
HTMLParser
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="HTMLParser">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser-attributes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Attributes
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Attributes">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.supported_types" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
supported_types
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser-functions" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Functions
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Functions">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.parse" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
parse
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.parse_div" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
parse_div
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.parse_link" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
parse_link
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.parse_meta" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
parse_meta
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.parse_table" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
parse_table
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLParser.supports" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
supports
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLScraper" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
HTMLScraper
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="HTMLScraper">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLScraper-functions" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Functions
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Functions">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLScraper.fetch" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
fetch
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.HTMLScraper.validate_content_type" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
validate_content_type
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFParser" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
PDFParser
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="PDFParser">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFParser-attributes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Attributes
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Attributes">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFParser.supported_types" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
supported_types
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFParser-functions" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Functions
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Functions">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFParser.parse" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
parse
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFParser.supports" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
supports
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFScraper" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
PDFScraper
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="PDFScraper">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFScraper-functions" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Functions
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Functions">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#omniread.PDFScraper.fetch" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
fetch
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<div class="md-content" data-md-component="content">
|
|
<article class="md-content__inner md-typeset">
|
|
|
|
|
|
|
|
|
|
|
|
<h1 id="omniread_1">omniread</h1>
|
|
|
|
|
|
<div class="doc doc-object doc-module">
|
|
|
|
|
|
|
|
<h2 id="omniread" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-module"></code> <span class="doc doc-object-name doc-module-name">omniread</span>
|
|
|
|
|
|
</h2>
|
|
|
|
<div class="doc doc-contents first">
|
|
|
|
<p>OmniRead — format-agnostic content acquisition and parsing framework.</p>
|
|
<hr />
|
|
<h4 id="omniread--summary">Summary</h4>
|
|
<p>OmniRead provides a <strong>cleanly layered architecture</strong> for fetching, parsing,
|
|
and normalizing content from heterogeneous sources such as HTML documents
|
|
and PDF files.</p>
|
|
<p>The library is structured around three core concepts:</p>
|
|
<ol>
|
|
<li><strong>Content</strong>: A canonical, format-agnostic container representing raw content bytes and minimal contextual metadata.</li>
|
|
<li><strong>Scrapers</strong>: Components responsible for <em>acquiring</em> raw content from a source (HTTP, filesystem, object storage, etc.). Scrapers never interpret content.</li>
|
|
<li><strong>Parsers</strong>: Components responsible for <em>interpreting</em> acquired content and converting it into structured, typed representations.</li>
|
|
</ol>
|
|
<p>OmniRead deliberately separates these responsibilities to ensure:</p>
|
|
<ul>
|
|
<li>Clear boundaries between IO and interpretation</li>
|
|
<li>Replaceable implementations per format</li>
|
|
<li>Predictable, testable behavior</li>
|
|
</ul>
|
|
<hr />
|
|
<h4 id="omniread--installation">Installation</h4>
|
|
<p>Install OmniRead using pip:</p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code>pip install omniread
|
|
</code></pre></div></td></tr></table></div>
|
|
<p>Or with Poetry:</p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code>poetry add omniread
|
|
</code></pre></div></td></tr></table></div>
|
|
<hr />
|
|
<h4 id="omniread--quick-start">Quick start</h4>
|
|
<p>HTML example:</p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"> 1</span>
|
|
<span class="normal"> 2</span>
|
|
<span class="normal"> 3</span>
|
|
<span class="normal"> 4</span>
|
|
<span class="normal"> 5</span>
|
|
<span class="normal"> 6</span>
|
|
<span class="normal"> 7</span>
|
|
<span class="normal"> 8</span>
|
|
<span class="normal"> 9</span>
|
|
<span class="normal">10</span>
|
|
<span class="normal">11</span></pre></div></td><td class="code"><div><pre><span></span><code>from omniread import HTMLScraper, HTMLParser
|
|
|
|
scraper = HTMLScraper()
|
|
content = scraper.fetch("https://example.com")
|
|
|
|
class TitleParser(HTMLParser[str]):
|
|
def parse(self) -> str:
|
|
return self._soup.title.string
|
|
|
|
parser = TitleParser(content)
|
|
title = parser.parse()
|
|
</code></pre></div></td></tr></table></div>
|
|
<p>PDF example:</p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"> 1</span>
|
|
<span class="normal"> 2</span>
|
|
<span class="normal"> 3</span>
|
|
<span class="normal"> 4</span>
|
|
<span class="normal"> 5</span>
|
|
<span class="normal"> 6</span>
|
|
<span class="normal"> 7</span>
|
|
<span class="normal"> 8</span>
|
|
<span class="normal"> 9</span>
|
|
<span class="normal">10</span>
|
|
<span class="normal">11</span>
|
|
<span class="normal">12</span>
|
|
<span class="normal">13</span>
|
|
<span class="normal">14</span></pre></div></td><td class="code"><div><pre><span></span><code>from omniread import FileSystemPDFClient, PDFScraper, PDFParser
|
|
from pathlib import Path
|
|
|
|
client = FileSystemPDFClient()
|
|
scraper = PDFScraper(client=client)
|
|
content = scraper.fetch(Path("document.pdf"))
|
|
|
|
class TextPDFParser(PDFParser[str]):
|
|
def parse(self) -> str:
|
|
# implement PDF text extraction
|
|
...
|
|
|
|
parser = TextPDFParser(content)
|
|
result = parser.parse()
|
|
</code></pre></div></td></tr></table></div>
|
|
<hr />
|
|
<h4 id="omniread--public-api">Public API</h4>
|
|
<p>This module re-exports the <strong>recommended public entry points</strong> of OmniRead.
|
|
Consumers are encouraged to import from this namespace rather than from
|
|
format-specific submodules directly, unless advanced customization is
|
|
required.</p>
|
|
<p><strong>Core:</strong></p>
|
|
<ul>
|
|
<li>Content</li>
|
|
<li>ContentType</li>
|
|
</ul>
|
|
<p><strong>HTML:</strong></p>
|
|
<ul>
|
|
<li>HTMLScraper</li>
|
|
<li>HTMLParser</li>
|
|
</ul>
|
|
<p><strong>PDF:</strong></p>
|
|
<ul>
|
|
<li>FileSystemPDFClient</li>
|
|
<li>PDFScraper</li>
|
|
<li>PDFParser</li>
|
|
</ul>
|
|
<p><strong>Core Philosophy:</strong>
|
|
<code>OmniRead</code> is designed as a <strong>decoupled content engine</strong>:</p>
|
|
<ol>
|
|
<li><strong>Separation of Concerns</strong>: Scrapers <em>fetch</em>, Parsers <em>interpret</em>. Neither knows about the other.</li>
|
|
<li><strong>Normalized Exchange</strong>: All components communicate via the <code>Content</code> model, ensuring a consistent contract.</li>
|
|
<li><strong>Format Agnosticism</strong>: The core logic is independent of whether the input is HTML, PDF, or JSON.</li>
|
|
</ol>
|
|
<hr />
|
|
|
|
|
|
|
|
<div class="doc doc-children">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<h3 id="omniread-classes">Classes</h3>
|
|
|
|
<div class="doc doc-object doc-class">
|
|
|
|
|
|
|
|
<h4 id="omniread.Content" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-class"></code> <span class="doc doc-object-name doc-class-name">Content</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-dataclass"><code>dataclass</code></small>
|
|
</span>
|
|
|
|
</h4>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">Content</span><span class="p">(</span><span class="n">raw</span><span class="p">:</span> <span class="nb">bytes</span><span class="p">,</span> <span class="n">source</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">content_type</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">ContentType</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> <span class="n">metadata</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Mapping</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">)</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
|
|
<p>Normalized representation of extracted content.</p>
|
|
|
|
|
|
<details class="notes" open>
|
|
<summary>Notes</summary>
|
|
<p><strong>Responsibilities:</strong></p>
|
|
<ul>
|
|
<li>A <code>Content</code> instance represents a raw content payload along with minimal contextual metadata describing its origin and type</li>
|
|
<li>This class is the primary exchange format between Scrapers, Parsers, and Downstream consumers</li>
|
|
</ul>
|
|
</details>
|
|
|
|
|
|
|
|
<div class="doc doc-children">
|
|
|
|
|
|
|
|
|
|
|
|
<h5 id="omniread.Content-attributes">Attributes</h5>
|
|
|
|
<div class="doc doc-object doc-attribute">
|
|
|
|
|
|
|
|
<h6 id="omniread.Content.content_type" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-attribute"></code> <span class="doc doc-object-name doc-attribute-name">content_type</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-class-attribute"><code>class-attribute</code></small>
|
|
<small class="doc doc-label doc-label-instance-attribute"><code>instance-attribute</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="n">content_type</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">ContentType</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Optional MIME type of the content, if known.</p>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-attribute">
|
|
|
|
|
|
|
|
<h6 id="omniread.Content.metadata" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-attribute"></code> <span class="doc doc-object-name doc-attribute-name">metadata</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-class-attribute"><code>class-attribute</code></small>
|
|
<small class="doc doc-label doc-label-instance-attribute"><code>instance-attribute</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="n">metadata</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Mapping</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Optional, implementation-defined metadata associated with the content (e.g., headers, encoding hints, extraction notes).</p>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-attribute">
|
|
|
|
|
|
|
|
<h6 id="omniread.Content.raw" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-attribute"></code> <span class="doc doc-object-name doc-attribute-name">raw</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-instance-attribute"><code>instance-attribute</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="n">raw</span><span class="p">:</span> <span class="nb">bytes</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Raw content bytes as retrieved from the source.</p>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-attribute">
|
|
|
|
|
|
|
|
<h6 id="omniread.Content.source" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-attribute"></code> <span class="doc doc-object-name doc-attribute-name">source</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-instance-attribute"><code>instance-attribute</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="n">source</span><span class="p">:</span> <span class="nb">str</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Identifier of the content origin (URL, file path, or logical name).</p>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-class">
|
|
|
|
|
|
|
|
<h4 id="omniread.ContentType" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-class"></code> <span class="doc doc-object-name doc-class-name">ContentType</span>
|
|
|
|
|
|
</h4>
|
|
|
|
|
|
<div class="doc doc-contents ">
|
|
<p class="doc doc-class-bases">
|
|
Bases: <code>str</code>, <code><span title="enum.Enum">Enum</span></code></p>
|
|
|
|
|
|
<p>Supported MIME types for extracted content.</p>
|
|
|
|
|
|
<details class="notes" open>
|
|
<summary>Notes</summary>
|
|
<p><strong>Guarantees:</strong></p>
|
|
<ul>
|
|
<li>This enum represents the declared or inferred media type of the content source</li>
|
|
<li>It is primarily used for routing content to the appropriate parser or downstream consumer</li>
|
|
</ul>
|
|
</details>
|
|
|
|
|
|
|
|
<div class="doc doc-children">
|
|
|
|
|
|
|
|
|
|
|
|
<h5 id="omniread.ContentType-attributes">Attributes</h5>
|
|
|
|
<div class="doc doc-object doc-attribute">
|
|
|
|
|
|
|
|
<h6 id="omniread.ContentType.HTML" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-attribute"></code> <span class="doc doc-object-name doc-attribute-name">HTML</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-class-attribute"><code>class-attribute</code></small>
|
|
<small class="doc doc-label doc-label-instance-attribute"><code>instance-attribute</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="n">HTML</span> <span class="o">=</span> <span class="s1">'text/html'</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>HTML document content.</p>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-attribute">
|
|
|
|
|
|
|
|
<h6 id="omniread.ContentType.JSON" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-attribute"></code> <span class="doc doc-object-name doc-attribute-name">JSON</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-class-attribute"><code>class-attribute</code></small>
|
|
<small class="doc doc-label doc-label-instance-attribute"><code>instance-attribute</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="n">JSON</span> <span class="o">=</span> <span class="s1">'application/json'</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>JSON document content.</p>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-attribute">
|
|
|
|
|
|
|
|
<h6 id="omniread.ContentType.PDF" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-attribute"></code> <span class="doc doc-object-name doc-attribute-name">PDF</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-class-attribute"><code>class-attribute</code></small>
|
|
<small class="doc doc-label doc-label-instance-attribute"><code>instance-attribute</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="n">PDF</span> <span class="o">=</span> <span class="s1">'application/pdf'</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>PDF document content.</p>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-attribute">
|
|
|
|
|
|
|
|
<h6 id="omniread.ContentType.XML" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-attribute"></code> <span class="doc doc-object-name doc-attribute-name">XML</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-class-attribute"><code>class-attribute</code></small>
|
|
<small class="doc doc-label doc-label-instance-attribute"><code>instance-attribute</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="n">XML</span> <span class="o">=</span> <span class="s1">'application/xml'</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>XML document content.</p>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-class">
|
|
|
|
|
|
|
|
<h4 id="omniread.FileSystemPDFClient" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-class"></code> <span class="doc doc-object-name doc-class-name">FileSystemPDFClient</span>
|
|
|
|
|
|
</h4>
|
|
|
|
|
|
<div class="doc doc-contents ">
|
|
<p class="doc doc-class-bases">
|
|
Bases: <code><a class="autorefs autorefs-internal" title="omniread.pdf.client.BasePDFClient" href="pdf/client/#omniread.pdf.client.BasePDFClient">BasePDFClient</a></code></p>
|
|
|
|
|
|
<p>PDF client that reads from the local filesystem.</p>
|
|
|
|
|
|
<details class="notes" open>
|
|
<summary>Notes</summary>
|
|
<p><strong>Guarantees:</strong></p>
|
|
<ul>
|
|
<li>This client reads PDF files directly from the disk and returns their raw binary contents</li>
|
|
</ul>
|
|
</details>
|
|
|
|
|
|
|
|
<div class="doc doc-children">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<h5 id="omniread.FileSystemPDFClient-functions">Functions</h5>
|
|
|
|
<div class="doc doc-object doc-function">
|
|
|
|
|
|
<h6 id="omniread.FileSystemPDFClient.fetch" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">fetch</span>
|
|
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">fetch</span><span class="p">(</span><span class="n">path</span><span class="p">:</span> <span class="n">Path</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bytes</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Read a PDF file from the local filesystem.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Parameters:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
<th>Default</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>path</code></td>
|
|
<td>
|
|
<code><span title="pathlib.Path">Path</span></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Filesystem path to the PDF file.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<em>required</em>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Returns:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th> <th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>bytes</code></td> <td>
|
|
<code>bytes</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Raw PDF bytes.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Raises:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td>
|
|
<code>FileNotFoundError</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>If the path does not exist.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
<tr class="doc-section-item">
|
|
<td>
|
|
<code>ValueError</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>If the path exists but is not a file.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-class">
|
|
|
|
|
|
|
|
<h4 id="omniread.HTMLParser" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-class"></code> <span class="doc doc-object-name doc-class-name">HTMLParser</span>
|
|
|
|
|
|
</h4>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">HTMLParser</span><span class="p">(</span><span class="n">content</span><span class="p">:</span> <span class="n">Content</span><span class="p">,</span> <span class="n">features</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s1">'html.parser'</span><span class="p">)</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
<p class="doc doc-class-bases">
|
|
Bases: <code><a class="autorefs autorefs-internal" title="omniread.core.parser.BaseParser" href="omniread/core/parser/#omniread.core.parser.BaseParser">BaseParser</a>[<span title="omniread.html.parser.T">T</span>]</code>, <code><span title="typing.Generic">Generic</span>[<span title="omniread.html.parser.T">T</span>]</code></p>
|
|
|
|
|
|
<p>Base HTML parser.</p>
|
|
|
|
|
|
<details class="notes" open>
|
|
<summary>Notes</summary>
|
|
<p><strong>Responsibilities:</strong></p>
|
|
<ul>
|
|
<li>This class extends the core <code>BaseParser</code> with HTML-specific behavior, including DOM parsing via BeautifulSoup and reusable extraction helpers</li>
|
|
<li>Provides reusable helpers for HTML extraction. Concrete parsers must explicitly define the return type</li>
|
|
</ul>
|
|
<p><strong>Guarantees:</strong></p>
|
|
<ul>
|
|
<li>Characteristics: Accepts only HTML content, owns a parsed BeautifulSoup DOM tree, provides pure helper utilities for common HTML structures</li>
|
|
</ul>
|
|
<p><strong>Constraints:</strong></p>
|
|
<ul>
|
|
<li>Concrete subclasses must define the output type <code>T</code> and implement the <code>parse()</code> method</li>
|
|
</ul>
|
|
</details>
|
|
<p>Initialize the HTML parser.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Parameters:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
<th>Default</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>content</code></td>
|
|
<td>
|
|
<code><a class="autorefs autorefs-internal" title="omniread.core.content.Content" href="omniread/core/content/#omniread.core.content.Content">Content</a></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>HTML content to be parsed.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<em>required</em>
|
|
</td>
|
|
</tr>
|
|
<tr class="doc-section-item">
|
|
<td><code>features</code></td>
|
|
<td>
|
|
<code>str</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>BeautifulSoup parser backend to use (e.g., 'html.parser', 'lxml').</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<code>'html.parser'</code>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Raises:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td>
|
|
<code>ValueError</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>If the content is empty or not valid HTML.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
|
|
|
|
<div class="doc doc-children">
|
|
|
|
|
|
|
|
|
|
|
|
<h5 id="omniread.HTMLParser-attributes">Attributes</h5>
|
|
|
|
<div class="doc doc-object doc-attribute">
|
|
|
|
|
|
|
|
<h6 id="omniread.HTMLParser.supported_types" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-attribute"></code> <span class="doc doc-object-name doc-attribute-name">supported_types</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-class-attribute"><code>class-attribute</code></small>
|
|
<small class="doc doc-label doc-label-instance-attribute"><code>instance-attribute</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="n">supported_types</span><span class="p">:</span> <span class="nb">set</span><span class="p">[</span><span class="n">ContentType</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span><span class="n">HTML</span><span class="p">}</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Set of content types supported by this parser (HTML only).</p>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<h5 id="omniread.HTMLParser-functions">Functions</h5>
|
|
|
|
<div class="doc doc-object doc-function">
|
|
|
|
|
|
<h6 id="omniread.HTMLParser.parse" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">parse</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-abstractmethod"><code>abstractmethod</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">parse</span><span class="p">()</span> <span class="o">-></span> <span class="n">T</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Fully parse the HTML content into structured output.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Returns:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th> <th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>T</code></td> <td>
|
|
<code><span title="omniread.html.parser.T">T</span></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Parsed representation of type <code>T</code>.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<details class="notes" open>
|
|
<summary>Notes</summary>
|
|
<p><strong>Responsibilities:</strong></p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code>- Implementations must fully interpret the HTML DOM and return a deterministic, structured output
|
|
</code></pre></div></td></tr></table></div>
|
|
</details>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-function">
|
|
|
|
|
|
<h6 id="omniread.HTMLParser.parse_div" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">parse_div</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-staticmethod"><code>staticmethod</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">parse_div</span><span class="p">(</span><span class="n">div</span><span class="p">:</span> <span class="n">Tag</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">separator</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s1">' '</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Extract normalized text from a <code>&lt;div&gt;</code> element.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Parameters:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
<th>Default</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>div</code></td>
|
|
<td>
|
|
<code><span title="bs4.Tag">Tag</span></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>BeautifulSoup tag representing a <code>&lt;div&gt;</code>.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<em>required</em>
|
|
</td>
|
|
</tr>
|
|
<tr class="doc-section-item">
|
|
<td><code>separator</code></td>
|
|
<td>
|
|
<code>str</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>String used to separate text nodes.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<code>' '</code>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Returns:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th> <th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>str</code></td> <td>
|
|
<code>str</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Flattened, whitespace-normalized text content.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-function">
|
|
|
|
|
|
<h6 id="omniread.HTMLParser.parse_link" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">parse_link</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-staticmethod"><code>staticmethod</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">parse_link</span><span class="p">(</span><span class="n">a</span><span class="p">:</span> <span class="n">Tag</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Extract the hyperlink reference from an <code>&lt;a&gt;</code> element.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Parameters:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
<th>Default</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>a</code></td>
|
|
<td>
|
|
<code><span title="bs4.Tag">Tag</span></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>BeautifulSoup tag representing an anchor.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<em>required</em>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Returns:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td>
|
|
<code><span title="typing.Optional">Optional</span>[str]</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>The value of the <code>href</code> attribute, or <code>None</code> if absent.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-function">
|
|
|
|
|
|
<h6 id="omniread.HTMLParser.parse_meta" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">parse_meta</span>
|
|
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">parse_meta</span><span class="p">()</span> <span class="o">-></span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Extract high-level metadata from the HTML document.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Returns:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td>
|
|
<code>dict[str, <span title="typing.Any">Any</span>]</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Dictionary containing extracted metadata.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<details class="notes" open>
|
|
<summary>Notes</summary>
|
|
<p><strong>Responsibilities:</strong></p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span>
|
|
<span class="normal">2</span></pre></div></td><td class="code"><div><pre><span></span><code>- Extract high-level metadata from the HTML document
|
|
- This includes: Document title, `&lt;meta&gt;` tag name/property → content mappings
|
|
</code></pre></div></td></tr></table></div>
|
|
</details>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-function">
|
|
|
|
|
|
<h6 id="omniread.HTMLParser.parse_table" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">parse_table</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-staticmethod"><code>staticmethod</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">parse_table</span><span class="p">(</span><span class="n">table</span><span class="p">:</span> <span class="n">Tag</span><span class="p">)</span> <span class="o">-></span> <span class="nb">list</span><span class="p">[</span><span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Parse an HTML table into a 2D list of strings.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Parameters:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
<th>Default</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>table</code></td>
|
|
<td>
|
|
<code><span title="bs4.Tag">Tag</span></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>BeautifulSoup tag representing a <code>&lt;table&gt;</code>.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<em>required</em>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Returns:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td>
|
|
<code>list[list[str]]</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>A list of rows, where each row is a list of cell text values.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-function">
|
|
|
|
|
|
<h6 id="omniread.HTMLParser.supports" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">supports</span>
|
|
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">supports</span><span class="p">()</span> <span class="o">-></span> <span class="nb">bool</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Check whether this parser supports the content's type.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Returns:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th> <th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>bool</code></td> <td>
|
|
<code>bool</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>True if the content type is supported; False otherwise.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-class">
|
|
|
|
|
|
|
|
<h4 id="omniread.HTMLScraper" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-class"></code> <span class="doc doc-object-name doc-class-name">HTMLScraper</span>
|
|
|
|
|
|
</h4>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">HTMLScraper</span><span class="p">(</span><span class="o">*</span><span class="p">,</span> <span class="n">client</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">httpx</span><span class="o">.</span><span class="n">Client</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">timeout</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">15.0</span><span class="p">,</span> <span class="n">headers</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Mapping</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">follow_redirects</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
<p class="doc doc-class-bases">
|
|
Bases: <code><a class="autorefs autorefs-internal" title="omniread.core.scraper.BaseScraper" href="omniread/core/scraper/#omniread.core.scraper.BaseScraper">BaseScraper</a></code></p>
|
|
|
|
|
|
<p>Base HTML scraper using httpx.</p>
|
|
|
|
|
|
<details class="notes" open>
|
|
<summary>Notes</summary>
|
|
<p><strong>Responsibilities:</strong></p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span>
|
|
<span class="normal">2</span></pre></div></td><td class="code"><div><pre><span></span><code>- This scraper retrieves HTML documents over HTTP(S) and returns them as raw content wrapped in a `Content` object
|
|
- Fetches raw bytes and metadata only. The scraper uses `httpx.Client` for HTTP requests, enforces an HTML content type, and preserves HTTP response metadata
|
|
</code></pre></div></td></tr></table></div>
|
|
<p><strong>Constraints:</strong></p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code>- The scraper does not parse HTML, perform retries or backoff, or handle non-HTML responses
|
|
</code></pre></div></td></tr></table></div>
|
|
</details>
|
|
<p>Initialize the HTML scraper.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Parameters:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
<th>Default</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>client</code></td>
|
|
<td>
|
|
<code><span title="httpx.Client">Client</span> | None</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Optional pre-configured <code>httpx.Client</code>. If omitted, a client is created internally.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<code>None</code>
|
|
</td>
|
|
</tr>
|
|
<tr class="doc-section-item">
|
|
<td><code>timeout</code></td>
|
|
<td>
|
|
<code>float</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Request timeout in seconds.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<code>15.0</code>
|
|
</td>
|
|
</tr>
|
|
<tr class="doc-section-item">
|
|
<td><code>headers</code></td>
|
|
<td>
|
|
<code><span title="typing.Optional">Optional</span>[<span title="typing.Mapping">Mapping</span>[str, str]]</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Optional default HTTP headers.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<code>None</code>
|
|
</td>
|
|
</tr>
|
|
<tr class="doc-section-item">
|
|
<td><code>follow_redirects</code></td>
|
|
<td>
|
|
<code>bool</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Whether to follow HTTP redirects.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<code>True</code>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
|
|
|
|
<div class="doc doc-children">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<h5 id="omniread.HTMLScraper-functions">Functions</h5>
|
|
|
|
<div class="doc doc-object doc-function">
|
|
|
|
|
|
<h6 id="omniread.HTMLScraper.fetch" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">fetch</span>
|
|
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">fetch</span><span class="p">(</span><span class="n">source</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">metadata</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Mapping</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Content</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Fetch an HTML document from the given source.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Parameters:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
<th>Default</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>source</code></td>
|
|
<td>
|
|
<code>str</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>URL of the HTML document.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<em>required</em>
|
|
</td>
|
|
</tr>
|
|
<tr class="doc-section-item">
|
|
<td><code>metadata</code></td>
|
|
<td>
|
|
<code><span title="typing.Optional">Optional</span>[<span title="typing.Mapping">Mapping</span>[str, <span title="typing.Any">Any</span>]]</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Optional metadata to be merged into the returned content.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<code>None</code>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Returns:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th> <th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>Content</code></td> <td>
|
|
<code><a class="autorefs autorefs-internal" title="omniread.core.content.Content" href="omniread/core/content/#omniread.core.content.Content">Content</a></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>A <code>Content</code> instance containing raw HTML bytes, source URL, HTML content type, and HTTP response metadata.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Raises:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td>
|
|
<code><span title="httpx.HTTPError">HTTPError</span></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>If the HTTP request fails.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
<tr class="doc-section-item">
|
|
<td>
|
|
<code>ValueError</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>If the response is not valid HTML.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-function">
|
|
|
|
|
|
<h6 id="omniread.HTMLScraper.validate_content_type" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">validate_content_type</span>
|
|
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">validate_content_type</span><span class="p">(</span><span class="n">response</span><span class="p">:</span> <span class="n">httpx</span><span class="o">.</span><span class="n">Response</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Validate that the HTTP response contains HTML content.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Parameters:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
<th>Default</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>response</code></td>
|
|
<td>
|
|
<code><span title="httpx.Response">Response</span></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>HTTP response returned by <code>httpx</code>.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<em>required</em>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Raises:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td>
|
|
<code>ValueError</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>If the <code>Content-Type</code> header is missing or does not indicate HTML content.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-class">
|
|
|
|
|
|
|
|
<h4 id="omniread.PDFParser" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-class"></code> <span class="doc doc-object-name doc-class-name">PDFParser</span>
|
|
|
|
|
|
</h4>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">PDFParser</span><span class="p">(</span><span class="n">content</span><span class="p">:</span> <span class="n">Content</span><span class="p">)</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
<p class="doc doc-class-bases">
|
|
Bases: <code><a class="autorefs autorefs-internal" title="omniread.core.parser.BaseParser" href="omniread/core/parser/#omniread.core.parser.BaseParser">BaseParser</a>[<span title="omniread.pdf.parser.T">T</span>]</code>, <code><span title="typing.Generic">Generic</span>[<span title="omniread.pdf.parser.T">T</span>]</code></p>
|
|
|
|
|
|
<p>Base PDF parser.</p>
|
|
|
|
|
|
<details class="notes" open>
|
|
<summary>Notes</summary>
|
|
<p><strong>Responsibilities:</strong></p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code>- This class enforces PDF content-type compatibility and provides the extension point for implementing concrete PDF parsing strategies
|
|
</code></pre></div></td></tr></table></div>
|
|
<p><strong>Constraints:</strong></p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code>- Concrete implementations must define the output type `T` and implement the `parse()` method
|
|
</code></pre></div></td></tr></table></div>
|
|
</details>
|
|
<p>Initialize the parser with content to be parsed.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Parameters:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
<th>Default</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>content</code></td>
|
|
<td>
|
|
<code><a class="autorefs autorefs-internal" title="omniread.core.content.Content" href="omniread/core/content/#omniread.core.content.Content">Content</a></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Content instance to be parsed.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<em>required</em>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Raises:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td>
|
|
<code>ValueError</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>If the content type is not supported by this parser.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
|
|
|
|
<div class="doc doc-children">
|
|
|
|
|
|
|
|
|
|
|
|
<h5 id="omniread.PDFParser-attributes">Attributes</h5>
|
|
|
|
<div class="doc doc-object doc-attribute">
|
|
|
|
|
|
|
|
<h6 id="omniread.PDFParser.supported_types" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-attribute"></code> <span class="doc doc-object-name doc-attribute-name">supported_types</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-class-attribute"><code>class-attribute</code></small>
|
|
<small class="doc doc-label doc-label-instance-attribute"><code>instance-attribute</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="n">supported_types</span><span class="p">:</span> <span class="nb">set</span><span class="p">[</span><span class="n">ContentType</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span><span class="n">PDF</span><span class="p">}</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Set of content types supported by this parser (PDF only).</p>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<h5 id="omniread.PDFParser-functions">Functions</h5>
|
|
|
|
<div class="doc doc-object doc-function">
|
|
|
|
|
|
<h6 id="omniread.PDFParser.parse" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">parse</span>
|
|
|
|
|
|
<span class="doc doc-labels">
|
|
<small class="doc doc-label doc-label-abstractmethod"><code>abstractmethod</code></small>
|
|
</span>
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">parse</span><span class="p">()</span> <span class="o">-></span> <span class="n">T</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Parse PDF content into a structured output.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Returns:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th> <th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>T</code></td> <td>
|
|
<code><span title="omniread.pdf.parser.T">T</span></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Parsed representation of type <code>T</code>.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Raises:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td>
|
|
<code>Exception</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Parsing-specific errors as defined by the implementation.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<details class="notes" open>
|
|
<summary>Notes</summary>
|
|
<p><strong>Responsibilities:</strong></p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code>- Implementations must fully interpret the PDF binary payload and return a deterministic, structured output
|
|
</code></pre></div></td></tr></table></div>
|
|
</details>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-function">
|
|
|
|
|
|
<h6 id="omniread.PDFParser.supports" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">supports</span>
|
|
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">supports</span><span class="p">()</span> <span class="o">-></span> <span class="nb">bool</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Check whether this parser supports the content's type.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Returns:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th> <th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>bool</code></td> <td>
|
|
<code>bool</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>True if the content type is supported; False otherwise.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
<div class="doc doc-object doc-class">
|
|
|
|
|
|
|
|
<h4 id="omniread.PDFScraper" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-class"></code> <span class="doc doc-object-name doc-class-name">PDFScraper</span>
|
|
|
|
|
|
</h4>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">PDFScraper</span><span class="p">(</span><span class="o">*</span><span class="p">,</span> <span class="n">client</span><span class="p">:</span> <span class="n">BasePDFClient</span><span class="p">)</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
<p class="doc doc-class-bases">
|
|
Bases: <code><a class="autorefs autorefs-internal" title="omniread.core.scraper.BaseScraper" href="omniread/core/scraper/#omniread.core.scraper.BaseScraper">BaseScraper</a></code></p>
|
|
|
|
|
|
<p>Scraper for PDF sources.</p>
|
|
|
|
|
|
<details class="notes" open>
|
|
<summary>Notes</summary>
|
|
<p><strong>Responsibilities:</strong></p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span>
|
|
<span class="normal">2</span></pre></div></td><td class="code"><div><pre><span></span><code>- Delegates byte retrieval to a PDF client and normalizes output into Content
|
|
- Preserves caller-provided metadata
|
|
</code></pre></div></td></tr></table></div>
|
|
<p><strong>Constraints:</strong></p>
|
|
<div class="language-text highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code>- The scraper does not perform parsing or interpretation and does not assume a specific storage backend
|
|
</code></pre></div></td></tr></table></div>
|
|
</details>
|
|
<p>Initialize the PDF scraper.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Parameters:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
<th>Default</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>client</code></td>
|
|
<td>
|
|
<code><a class="autorefs autorefs-internal" title="omniread.pdf.client.BasePDFClient" href="pdf/client/#omniread.pdf.client.BasePDFClient">BasePDFClient</a></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>PDF client responsible for retrieving raw PDF bytes.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<em>required</em>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
|
|
|
|
<div class="doc doc-children">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<h5 id="omniread.PDFScraper-functions">Functions</h5>
|
|
|
|
<div class="doc doc-object doc-function">
|
|
|
|
|
|
<h6 id="omniread.PDFScraper.fetch" class="doc doc-heading">
|
|
<code class="doc-symbol doc-symbol-heading doc-symbol-method"></code> <span class="doc doc-object-name doc-function-name">fetch</span>
|
|
|
|
|
|
</h6>
|
|
<div class="language-python doc-signature highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-1">1</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1"></a><span class="nf">fetch</span><span class="p">(</span><span class="n">source</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">metadata</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Mapping</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Content</span>
|
|
</span></code></pre></div></td></tr></table></div>
|
|
|
|
<div class="doc doc-contents ">
|
|
|
|
<p>Fetch a PDF document from the given source.</p>
|
|
|
|
|
|
<p><span class="doc-section-title">Parameters:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
<th>Default</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>source</code></td>
|
|
<td>
|
|
<code><span title="typing.Any">Any</span></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Identifier of the PDF source as understood by the configured PDF client.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<em>required</em>
|
|
</td>
|
|
</tr>
|
|
<tr class="doc-section-item">
|
|
<td><code>metadata</code></td>
|
|
<td>
|
|
<code><span title="typing.Optional">Optional</span>[<span title="typing.Mapping">Mapping</span>[str, <span title="typing.Any">Any</span>]]</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Optional metadata to attach to the returned content.</p>
|
|
</div>
|
|
</td>
|
|
<td>
|
|
<code>None</code>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Returns:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th> <th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td><code>Content</code></td> <td>
|
|
<code><a class="autorefs autorefs-internal" title="omniread.core.content.Content" href="omniread/core/content/#omniread.core.content.Content">Content</a></code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>A <code>Content</code> instance containing raw PDF bytes, source identifier, PDF content type, and optional metadata.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
|
|
<p><span class="doc-section-title">Raises:</span></p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Type</th>
|
|
<th>Description</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="doc-section-item">
|
|
<td>
|
|
<code>Exception</code>
|
|
</td>
|
|
<td>
|
|
<div class="doc-md-description">
|
|
<p>Retrieval-specific errors raised by the PDF client.</p>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
</div><ul>
|
|
<li><a href="omniread/">Omniread</a></li>
|
|
</ul>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</article>
|
|
</div>
|
|
|
|
|
|
<script>
// Reads the list stored under "__tabs" via __md_get. For each .tabbed-set,
// checks the input referenced by the first label whose trimmed text equals
// one of the stored entries (earlier stored entries win), then moves on to
// the next set.
var tabs = __md_get("__tabs");
if (Array.isArray(tabs)) {
  e: for (var set of document.querySelectorAll(".tabbed-set")) {
    var labels = set.querySelector(".tabbed-labels");
    for (var tab of tabs) {
      for (var label of labels.getElementsByTagName("label")) {
        if (label.innerText.trim() === tab) {
          var input = document.getElementById(label.htmlFor);
          input.checked = true;
          // One match per set is enough; skip straight to the next .tabbed-set.
          continue e;
        }
      }
    }
  }
}
</script>
|
|
|
|
<script>
// Looks up the element whose id equals the URL fragment (without the "#").
// If it exists and has a non-empty name, its checked state is set to whether
// that name starts with "__tabbed_".
var target = document.getElementById(location.hash.slice(1));
if (target && target.name) {
  target.checked = target.name.startsWith("__tabbed_");
}
</script>
|
|
</div>
|
|
|
|
<button type="button" class="md-top md-icon" data-md-component="top" hidden>
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8z"/></svg>
|
|
Back to top
|
|
</button>
|
|
|
|
</main>
|
|
|
|
<footer class="md-footer">
|
|
|
|
<div class="md-footer-meta md-typeset">
|
|
<div class="md-footer-meta__inner md-grid">
|
|
<div class="md-copyright">
|
|
|
|
|
|
Made with
|
|
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
|
Material for MkDocs
|
|
</a>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
</footer>
|
|
|
|
</div>
|
|
<div class="md-dialog" data-md-component="dialog">
|
|
<div class="md-dialog__inner md-typeset"></div>
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<script id="__config" type="application/json">{"base": ".", "features": ["navigation.sections", "navigation.expand", "navigation.top", "navigation.instant", "navigation.tracking", "navigation.indexes", "content.code.copy", "content.code.annotate", "content.tabs.link", "content.action.edit", "search.highlight", "search.share", "search.suggest"], "search": "assets/javascripts/workers/search.973d3a69.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
|
|
|
|
|
<script src="assets/javascripts/bundle.f55a23d4.min.js"></script>
|
|
|
|
|
|
</body>
|
|
</html> |