diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py
index 4bd04967..005da688 100644
--- a/archivebox/parsers/generic_rss.py
+++ b/archivebox/parsers/generic_rss.py
@@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
-from datetime import datetime
+from time import mktime
+from feedparser import parse as feedparser
from ..index.schema import Link
from ..util import (
htmldecode,
- enforce_types,
- str_between,
+ enforce_types
)
@enforce_types
@@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse RSS XML-format files into links"""
rss_file.seek(0)
- items = rss_file.read().split('<item>')
- items = items[1:] if items else []
- for item in items:
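- # example item:
- # <item>
- # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
- # <category>Unread</category>
- # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
- # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
- # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
- # </item>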
+ feed = feedparser(rss_file.read())
+ for item in feed.entries:
+ url = item.link
+ title = item.title
+ time = mktime(item.updated_parsed)
- trailing_removed = item.split('</item>', 1)[0]
- leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
- rows = leading_removed.split('\n')
+ try:
+ tags = ','.join(map(lambda tag: tag.term, item.tags))
+ except AttributeError:
+ tags = ''
- def get_row(key):
- return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
-
- url = str_between(get_row('link'), '<link>', '</link>')
- ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
- time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
- title = str_between(get_row('title'), ' Iterable[Link]:
"""Parse Pinboard RSS feed files into links"""
rss_file.seek(0)
- root = ElementTree.parse(rss_file).getroot()
- items = root.findall("{http://purl.org/rss/1.0/}item")
- for item in items:
- find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore
+ feed = feedparser(rss_file.read())
+ for item in feed.entries:
+ url = item.link
+ # title will start with "[priv] " if pin was marked private. useful?
+ title = item.title
+ time = mktime(item.updated_parsed)
- url = find("{http://purl.org/rss/1.0/}link")
- tags = find("{http://purl.org/dc/elements/1.1/}subject")
- title = find("{http://purl.org/rss/1.0/}title")
- ts_str = find("{http://purl.org/dc/elements/1.1/}date")
+ # all tags are in one entry.tags with spaces in it. annoying!
+ try:
+ tags = item.tags[0].term.replace(' ', ',')
+ except AttributeError:
+ tags = ''
if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue
- # Pinboard includes a colon in its date stamp timezone offsets, which
- # Python can't parse. Remove it:
- if ts_str and ts_str[-3:-2] == ":":
- ts_str = ts_str[:-3]+ts_str[-2:]
-
- if ts_str:
- time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
- else:
- time = datetime.now(timezone.utc)
-
yield Link(
url=htmldecode(url),
- timestamp=str(time.timestamp()),
+ timestamp=str(time),
title=htmldecode(title) or None,
tags=htmldecode(tags) or None,
sources=[rss_file.name],
diff --git a/pyproject.toml b/pyproject.toml
index 0907858b..cb18a911 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,7 @@ dependencies = [
"dateparser>=1.0.0",
"django-extensions>=3.0.3",
"django>=3.1.3,<3.2",
+ "feedparser>=6.0.11",
"ipython>5.0.0",
"mypy-extensions>=0.4.3",
"python-crontab>=2.5.1",