mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-21 19:53:06 +00:00
Use feedparser for RSS parsing (#1362)
Fixes #1171
Fixes #870 (probably; would need to test against a Wallabag Atom file to be sure)
Fixes #135
Fixes #123
Fixes #106
commit 099f7d00fe
6 changed files with 158 additions and 50 deletions
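The switch is easiest to follow with a minimal sketch of the feedparser API the rewritten parsers rely on. feedparser normalizes RSS 1.0/2.0 and Atom input into a single `entries` interface, which is why one parser can now cover both formats. The inline sample feed below is illustrative only, not one of the fixtures added by this commit:

```python
# Minimal sketch (not from the commit) of the feedparser API used below.
from time import mktime

import feedparser  # the new dependency: feedparser>=6.0.11

SAMPLE = """<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>Demo</title>
    <item>
      <title>Example</title>
      <link>https://example.com/post</link>
      <dc:subject>Tag1 Tag2</dc:subject>
      <dc:date>2024-02-26T17:28:12-08:00</dc:date>
    </item>
  </channel>
</rss>"""

feed = feedparser.parse(SAMPLE)           # the same call accepts Atom input
entry = feed.entries[0]
print(entry.link)                         # https://example.com/post
print(entry.title)                        # Example
print(mktime(entry.updated_parsed))       # struct_time -> epoch float
print([tag.term for tag in entry.tags])   # ['Tag1 Tag2']
```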
archivebox/parsers/generic_rss.py
@@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers'
 
 from typing import IO, Iterable
-from datetime import datetime
+from time import mktime
+from feedparser import parse as feedparser
 
 from ..index.schema import Link
 from ..util import (
     htmldecode,
-    enforce_types,
-    str_between,
+    enforce_types
 )
 
 
 @enforce_types
@@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse RSS XML-format files into links"""
 
     rss_file.seek(0)
-    items = rss_file.read().split('<item>')
-    items = items[1:] if items else []
-    for item in items:
-        # example item:
-        # <item>
-        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
-        # <category>Unread</category>
-        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
-        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
-        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
-        # </item>
+    feed = feedparser(rss_file.read())
+    for item in feed.entries:
+        url = item.link
+        title = item.title
+        time = mktime(item.updated_parsed)
 
-        trailing_removed = item.split('</item>', 1)[0]
-        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
-        rows = leading_removed.split('\n')
+        try:
+            tags = ','.join(map(lambda tag: tag.term, item.tags))
+        except AttributeError:
+            tags = ''
 
-        def get_row(key):
-            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
-
-        url = str_between(get_row('link'), '<link>', '</link>')
-        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
-        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
-        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
+        if url is None:
+            # Yielding a Link with no URL will
+            # crash on a URL validation assertion
+            continue
 
         yield Link(
             url=htmldecode(url),
-            timestamp=str(time.timestamp()),
+            timestamp=str(time),
             title=htmldecode(title) or None,
-            tags=None,
+            tags=tags,
             sources=[rss_file.name],
         )
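One detail in the rewritten parser above deserves a note: feedparser only attaches a `tags` attribute to entries whose feed declares categories, and its FeedParserDict raises AttributeError for missing attributes, hence the try/except around the tag join. A small illustration (the inline feed is made up):

```python
# Why the tag join above is guarded: a tagless entry has no `tags` at all.
import feedparser

feed = feedparser.parse("""<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0"><channel><title>Demo</title>
  <item><title>no tags here</title><link>https://example.com/a</link></item>
</channel></rss>""")

item = feed.entries[0]
try:
    tags = ','.join(tag.term for tag in item.tags)
except AttributeError:
    tags = ''   # the same fallback the new parser uses
assert tags == ''
```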
archivebox/parsers/pinboard_rss.py
@@ -2,50 +2,41 @@ __package__ = 'archivebox.parsers'
 
 from typing import IO, Iterable
-from datetime import datetime, timezone
-
-from xml.etree import ElementTree
+from time import mktime
+from feedparser import parse as feedparser
 
 from ..index.schema import Link
 from ..util import (
     htmldecode,
-    enforce_types,
+    enforce_types
 )
 
 
 @enforce_types
 def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse Pinboard RSS feed files into links"""
 
     rss_file.seek(0)
-    root = ElementTree.parse(rss_file).getroot()
-    items = root.findall("{http://purl.org/rss/1.0/}item")
-    for item in items:
-        find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None    # type: ignore
-
-        url = find("{http://purl.org/rss/1.0/}link")
-        tags = find("{http://purl.org/dc/elements/1.1/}subject")
-        title = find("{http://purl.org/rss/1.0/}title")
-        ts_str = find("{http://purl.org/dc/elements/1.1/}date")
+    feed = feedparser(rss_file.read())
+    for item in feed.entries:
+        url = item.link
+        # title will start with "[priv] " if pin was marked private. useful?
+        title = item.title
+        time = mktime(item.updated_parsed)
+        # all tags are in one entry.tags with spaces in it. annoying!
+        try:
+            tags = item.tags[0].term.replace(' ', ',')
+        except AttributeError:
+            tags = ''
 
         if url is None:
            # Yielding a Link with no URL will
            # crash on a URL validation assertion
            continue
 
-        # Pinboard includes a colon in its date stamp timezone offsets, which
-        # Python can't parse. Remove it:
-        if ts_str and ts_str[-3:-2] == ":":
-            ts_str = ts_str[:-3]+ts_str[-2:]
-
-        if ts_str:
-            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
-        else:
-            time = datetime.now(timezone.utc)
-
         yield Link(
             url=htmldecode(url),
-            timestamp=str(time.timestamp()),
+            timestamp=str(time),
             title=htmldecode(title) or None,
             tags=htmldecode(tags) or None,
             sources=[rss_file.name],
         )
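As the comment in the new code notes, Pinboard exports all of a pin's tags as one space-separated dc:subject element, so feedparser reports a single tag whose term contains spaces; the parser rewrites those spaces to commas. A sketch with a made-up pin:

```python
# Pinboard quirk: every tag of a pin arrives in one space-separated term.
import feedparser

pin_feed = feedparser.parse("""<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns="http://purl.org/rss/1.0/"
         xmlns:dc="http://purl.org/dc/elements/1.1/">
  <item rdf:about="https://example.com/a">
    <title>A pin</title>
    <link>https://example.com/a</link>
    <dc:subject>python rss archiving</dc:subject>
  </item>
</rdf:RDF>""")

item = pin_feed.entries[0]
assert item.tags[0].term == 'python rss archiving'
assert item.tags[0].term.replace(' ', ',') == 'python,rss,archiving'
```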
pyproject.toml
@@ -15,6 +15,7 @@ dependencies = [
     "dateparser>=1.0.0",
     "django-extensions>=3.0.3",
     "django>=3.1.3,<3.2",
+    "feedparser>=6.0.11",
     "ipython>5.0.0",
     "mypy-extensions>=0.4.3",
     "python-crontab>=2.5.1",
tests/mock_server/templates/example.atom (new file, 24 lines)
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="utf-8"?>
+<feed
+  xml:lang="en"
+  xmlns="http://www.w3.org/2005/Atom"
+  >
+  <id>http://www.example.com/</id>
+  <title>Example of an Atom feed</title>
+  <link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" />
+  <link rel="alternate" type="text/html" href="http://www.example.com/" />
+  <author>
+    <name>Jim Winstead</name>
+  </author>
+  <updated>2024-02-26T03:18:26Z</updated>
+  <entry>
+    <title>Example</title>
+    <link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" />
+    <id>tag:example.com,2024-02-25:3319</id>
+    <updated>2024-02-26T03:18:26Z</updated>
+    <published>2024-02-25T19:18:25-08:00</published>
+    <category term="Tag1" scheme="http://example.com/archive" />
+    <category term="Tag2" scheme="http://example.com/archive" />
+    <content type="html">This is some &lt;b&gt;content&lt;/b&gt;</content>
+  </entry>
+</feed>
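Because feedparser exposes Atom entries through the same attributes (`link`, `title`, `updated_parsed`, `tags`), this Atom fixture flows through the rewritten generic parser unchanged, which is what the Atom-related fixes in the commit message rely on. A sketch of exercising the parser directly, assuming the repo root as working directory and an importable `archivebox` package (the integration tests below go through `archivebox add` instead):

```python
# Sketch (not part of the commit): feed the Atom fixture straight into the
# rewritten generic parser. Paths and imports are assumptions, see above.
from archivebox.parsers.generic_rss import parse_generic_rss_export

with open('tests/mock_server/templates/example.atom', encoding='utf-8') as f:
    links = list(parse_generic_rss_export(f))

assert links[0].url == 'http://127.0.0.1:8080/static/example.com.html'
assert links[0].tags == 'Tag1,Tag2'  # both <category term="..."> values, comma-joined
```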
tests/mock_server/templates/example.rss (new file, 32 lines)
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="utf-8"?>
+<rss version="2.0"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:admin="http://webns.net/mvcb/"
+  xmlns:content="http://purl.org/rss/1.0/modules/content/"
+  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+<channel>
+  <title>Sample Feed</title>
+  <link>http://example.org/</link>
+  <description>For documentation only</description>
+  <dc:language>en-us</dc:language>
+  <dc:creator>Nobody (nobody@example.org)</dc:creator>
+  <dc:rights>Public domain</dc:rights>
+  <dc:date>2024-02-26T17:28:12-08:00</dc:date>
+  <admin:generatorAgent rdf:resource="http://www.example.org/"/>
+  <admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/>
+
+  <item>
+    <title>First!</title>
+    <link>http://127.0.0.1:8080/static/example.com.html</link>
+    <guid isPermaLink="false">just-an@example.org</guid>
+    <description>
+      This has a description.
+    </description>
+    <dc:subject>Tag1 Tag2</dc:subject>
+    <dc:date>2024-02-26T17:28:12-08:00</dc:date>
+    <content:encoded><![CDATA[
+      This has a <b>description</b>.]]>
+    </content:encoded>
+  </item>
+</channel>
+</rss>
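Note that the fixture's two tags travel in a single space-separated dc:subject element. That one term explains the differing expectations in the tests below: the generic parser keeps it verbatim as "Tag1 Tag2", while the pinboard parser splits it into "Tag1" and "Tag2". A sketch under the same repo-root and importability assumptions as above:

```python
# Sketch: one fixture, two tag shapes, depending on which parser reads it.
from archivebox.parsers.generic_rss import parse_generic_rss_export
from archivebox.parsers.pinboard_rss import parse_pinboard_rss_export

FIXTURE = 'tests/mock_server/templates/example.rss'

with open(FIXTURE, encoding='utf-8') as f:
    generic = list(parse_generic_rss_export(f))
with open(FIXTURE, encoding='utf-8') as f:
    pinboard = list(parse_pinboard_rss_export(f))

assert generic[0].tags == 'Tag1 Tag2'    # dc:subject kept as a single term
assert pinboard[0].tags == 'Tag1,Tag2'   # spaces rewritten to commas
```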
tests/test_add.py
@@ -148,3 +148,71 @@ def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
     tags = list(map(lambda x: x[0], tags))
     assert "Tag1" in tags
     assert "Tag2" in tags
+
+
+def test_generic_rss(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert not "http://purl.org/dc/elements/1.1/" in urls
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1 Tag2" in tags
+
+
+def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1" in tags
+    assert "Tag2" in tags
+
+
+def test_atom(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert not "http://www.w3.org/2005/Atom" in urls
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1" in tags
+    assert "Tag2" in tags