__package__ = 'archivebox.parsers' import re from typing import IO, Iterable, Optional from datetime import datetime, timezone from ..index.schema import Link from ..util import ( htmldecode, enforce_types, URL_REGEX, ) from html.parser import HTMLParser from urllib.parse import urljoin class HrefParser(HTMLParser): def __init__(self): super().__init__() self.urls = [] def handle_starttag(self, tag, attrs): if tag == "a": for attr, value in attrs: if attr == "href": self.urls.append(value) @enforce_types def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]: """Parse Generic HTML for href tags and use only the url (support for title coming later)""" html_file.seek(0) for line in html_file: parser = HrefParser() # example line #