ArchiveBox/archivebox/parsers/generic_html.py

__package__ = 'archivebox.parsers'


import re

from typing import IO, Iterable, Optional
from datetime import datetime

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
    URL_REGEX,
)
from html.parser import HTMLParser
from urllib.parse import urljoin


class HrefParser(HTMLParser):
    """Collect the ``href`` attribute of every ``<a>`` start tag fed to the parser.

    After one or more ``feed()`` calls, ``self.urls`` holds the raw href
    strings in the order they were encountered. The values are stored as
    written in the markup; no resolving or validation happens here.
    """

    def __init__(self):
        super().__init__()
        # raw href values (always str), in the order the <a> tags were seen
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the href value(s) of an ``<a>`` tag; every other tag is ignored."""
        if tag != "a":
            return

        for attr, value in attrs:
            if attr == "href":
                # HTMLParser reports a value-less attribute (``<a href>``) as None.
                # Store "" instead so consumers can always treat entries as str
                # (passing None on to re.findall() would raise TypeError).
                self.urls.append(value if value is not None else "")


@enforce_types
def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
    """Yield a Link for each URL found in the <a href="..."> attributes of an HTML file.

    The file is rewound and scanned one line at a time. When root_url is given,
    every href is joined onto it first, so relative hrefs become absolute.
    Only the url is extracted: title and tags are always None, the timestamp
    is the time at which the link was parsed, and sources is [html_file.name].
    """

    html_file.seek(0)
    for html_line in html_file:
        # each line gets its own parser, e.g. for a line like:
        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
        href_parser = HrefParser()
        href_parser.feed(html_line)

        for href in href_parser.urls:
            # relative hrefs are resolved against root_url when one was given:
            # /home.html -> https://example.com/home.html
            candidate = urljoin(root_url, href) if root_url else href

            # keep only the portions of the href that URL_REGEX matches
            for matched_url in re.findall(URL_REGEX, candidate):
                yield Link(
                    url=htmldecode(matched_url),
                    timestamp=str(datetime.now().timestamp()),
                    title=None,
                    tags=None,
                    sources=[html_file.name],
                )
new generic_html parser for extracting hrefs 2020-08-18 12:29:05 +00:00			`__package__ = 'archivebox.parsers'`


			`import re`

			`from typing import IO, Iterable, Optional`
			`from datetime import datetime`

			`from ..index.schema import Link`
			`from ..util import (`
			`htmldecode,`
			`enforce_types,`
			`URL_REGEX,`
			`)`
			`from html.parser import HTMLParser`
			`from urllib.parse import urljoin`


			`class HrefParser(HTMLParser):`
			`def __init__(self):`
			`super().__init__()`
			`self.urls = []`

			`def handle_starttag(self, tag, attrs):`
			`if tag == "a":`
			`for attr, value in attrs:`
			`if attr == "href":`
			`self.urls.append(value)`


			`@enforce_types`
			`def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:`
fix parser docstring 2020-08-18 13:20:05 +00:00			`"""Parse Generic HTML for href tags and use only the url (support for title coming later)"""`
new generic_html parser for extracting hrefs 2020-08-18 12:29:05 +00:00
			`html_file.seek(0)`
			`for line in html_file:`
			`parser = HrefParser()`
			`# example line`
			`# <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>`
			`parser.feed(line)`
			`for url in parser.urls:`
			`if root_url:`
			`# resolve relative urls /home.html -> https://example.com/home.html`
			`url = urljoin(root_url, url)`

			`for archivable_url in re.findall(URL_REGEX, url):`
			`yield Link(`
			`url=htmldecode(archivable_url),`
			`timestamp=str(datetime.now().timestamp()),`
			`title=None,`
			`tags=None,`
			`sources=[html_file.name],`
			`)`