ArchiveBox/archivebox/parsers/pinboard_rss.py

# Declare the package explicitly; Python resolves the relative imports below
# (``..index.schema``, ``..util``) against this name.
__package__ = 'archivebox.parsers'


from typing import IO, Iterable
from datetime import datetime, timezone

from xml.etree import ElementTree

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)


def _child_text(item: ElementTree.Element, tag: str):
    """Return the stripped text of the first ``tag`` child of ``item``.

    Returns ``None`` when the child element is absent, when it has no text
    at all (``Element.text`` is ``None`` for empty elements such as
    ``<title/>``), or when its text is only whitespace.
    """
    node = item.find(tag)
    # Compare against None explicitly: an Element without children is falsy,
    # so a plain truthiness test would wrongly treat valid leaf elements as
    # missing.
    if node is None or node.text is None:
        return None
    return node.text.strip() or None


@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse a Pinboard RSS feed file into links.

    Args:
        rss_file: Open, seekable text file containing the RSS (RDF) feed.
            It is rewound to the start before parsing, and its ``name`` is
            recorded as the source of every yielded link.
        **_kwargs: Ignored; accepted so all parsers can be called with the
            same arbitrary metadata keyword arguments.

    Yields:
        One ``Link`` per ``<item>`` that has a non-empty ``<link>``. Items
        without a usable URL are skipped. Items without a ``dc:date`` are
        stamped with the current UTC time.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        ValueError: If an item's ``dc:date`` does not match the expected
            ``%Y-%m-%dT%H:%M:%S%z`` layout.
    """

    rss_file.seek(0)
    root = ElementTree.parse(rss_file).getroot()

    for item in root.findall("{http://purl.org/rss/1.0/}item"):
        url = _child_text(item, "{http://purl.org/rss/1.0/}link")
        if url is None:
            # Yielding a Link with no URL will
            # crash on a URL validation assertion
            continue

        tags = _child_text(item, "{http://purl.org/dc/elements/1.1/}subject")
        title = _child_text(item, "{http://purl.org/rss/1.0/}title")
        ts_str = _child_text(item, "{http://purl.org/dc/elements/1.1/}date")

        # Pinboard includes a colon in its date stamp timezone offsets
        # (e.g. "+00:00"), which %z cannot parse on older Python versions.
        # Remove it so the offset reads "+0000":
        if ts_str and ts_str[-3:-2] == ":":
            ts_str = ts_str[:-3] + ts_str[-2:]

        if ts_str:
            bookmarked_at = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
        else:
            bookmarked_at = datetime.now(timezone.utc)

        yield Link(
            url=htmldecode(url),
            timestamp=str(bookmarked_at.timestamp()),
            title=htmldecode(title) or None,
            tags=htmldecode(tags) or None,
            sources=[rss_file.name],
        )


# Module-level parser metadata: a short identifier, a human-readable name,
# and the parsing callable defined above.
# NOTE(review): presumably read by the parsers package to register this
# parser instead of hardcoding it there — confirm against the package __init__.
KEY = 'pinboard_rss'
NAME = 'Pinboard RSS'
PARSER = parse_pinboard_rss_export
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`__package__ = 'archivebox.parsers'`


			`from typing import IO, Iterable`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`from datetime import datetime, timezone`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00
			`from xml.etree import ElementTree`

			`from ..index.schema import Link`
			`from ..util import (`
			`htmldecode,`
			`enforce_types,`
			`)`


			`@enforce_types`
make all parsers accept arbitrary meta kwargs 2020-08-18 12:27:47 +00:00			`def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`"""Parse Pinboard RSS feed files into links"""`

			`rss_file.seek(0)`
			`root = ElementTree.parse(rss_file).getroot()`
			`items = root.findall("{http://purl.org/rss/1.0/}item")`
			`for item in items:`
Fix Pinboard RSS parsing valid links as `None` `item.find(p)` returns either an `ElementTree.Element` or `None`. The [lambda on line 24][lambda] coerces the return value to a bool, which is `False` if the `<link>` element has no children (see [`ElementTree.py` line 207][etbooldef]), so the lambda returns `None`. Further, returning a `Link` with `url=None` violates [an assertion in `index/schema.py`][assertion], which crashes the `archivebox add` command. [lambda]: https://github.com/ArchiveBox/ArchiveBox/blob/3d54b1321bf8c56627aaa50efcc809cd99caee52/archivebox/parsers/pinboard_rss.py#L24 [etbooldef]: https://github.com/python/cpython/blob/3d8993a744813c5144851da5347d7b4b1885f234/Lib/xml/etree/ElementTree.py#L207 [assertion]: https://github.com/ArchiveBox/ArchiveBox/blob/3d54b1321bf8c56627aaa50efcc809cd99caee52/archivebox/index/schema.py#L165 2021-08-04 13:26:51 +00:00			`find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00
			`url = find("{http://purl.org/rss/1.0/}link")`
			`tags = find("{http://purl.org/dc/elements/1.1/}subject")`
			`title = find("{http://purl.org/rss/1.0/}title")`
			`ts_str = find("{http://purl.org/dc/elements/1.1/}date")`

Fix Pinboard RSS parsing valid links as `None` `item.find(p)` returns either an `ElementTree.Element` or `None`. The [lambda on line 24][lambda] coerces the return value to a bool, which is `False` if the `<link>` element has no children (see [`ElementTree.py` line 207][etbooldef]), so the lambda returns `None`. Further, returning a `Link` with `url=None` violates [an assertion in `index/schema.py`][assertion], which crashes the `archivebox add` command. [lambda]: https://github.com/ArchiveBox/ArchiveBox/blob/3d54b1321bf8c56627aaa50efcc809cd99caee52/archivebox/parsers/pinboard_rss.py#L24 [etbooldef]: https://github.com/python/cpython/blob/3d8993a744813c5144851da5347d7b4b1885f234/Lib/xml/etree/ElementTree.py#L207 [assertion]: https://github.com/ArchiveBox/ArchiveBox/blob/3d54b1321bf8c56627aaa50efcc809cd99caee52/archivebox/index/schema.py#L165 2021-08-04 13:26:51 +00:00			`if url is None:`
			`# Yielding a Link with no URL will`
			`# crash on a URL validation assertion`
			`continue`

move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`# Pinboard includes a colon in its date stamp timezone offsets, which`
			`# Python can't parse. Remove it:`
			`if ts_str and ts_str[-3:-2] == ":":`
			`ts_str = ts_str[:-3]+ts_str[-2:]`

			`if ts_str:`
			`time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")`
			`else:`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`time = datetime.now(timezone.utc)`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00
			`yield Link(`
			`url=htmldecode(url),`
			`timestamp=str(time.timestamp()),`
			`title=htmldecode(title) or None,`
			`tags=htmldecode(tags) or None,`
			`sources=[rss_file.name],`
			`)`
use KEY, NAME, and PARSER to define parsers instead of hardcoding in init 2021-03-31 05:05:49 +00:00

			`KEY = 'pinboard_rss'`
			`NAME = 'Pinboard RSS'`
			`PARSER = parse_pinboard_rss_export`