Add tests for generic_rss and pinboard_rss parsers

This commit is contained in:
jim winstead 2024-03-01 11:22:28 -08:00
parent 22f9a289d3
commit 1f828d9441
3 changed files with 124 additions and 0 deletions

View file

@ -0,0 +1,24 @@
<?xml version="1.0" encoding="utf-8"?>
<feed
xml:lang="en"
xmlns="http://www.w3.org/2005/Atom"
>
<id>http://www.example.com/</id>
<title>Example of an Atom feed</title>
<link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" />
<link rel="alternate" type="text/html" href="http://www.example.com/" />
<author>
<name>Jim Winstead</name>
</author>
<updated>2024-02-26T03:18:26Z</updated>
<entry>
<title>Example</title>
<link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" />
<id>tag:example.com,2024-02-25:3319</id>
<updated>2024-02-26T03:18:26Z</updated>
<published>2024-02-25T19:18:25-08:00</published>
<category term="Tag1" scheme="http://example.com/archive" />
<category term="Tag2" scheme="http://example.com/archive" />
<content type="html">This is some &lt;b&gt;content&lt;/b&gt;</content>
</entry>
</feed>

View file

@ -0,0 +1,32 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:admin="http://webns.net/mvcb/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<channel>
<title>Sample Feed</title>
<link>http://example.org/</link>
<description>For documentation only</description>
<dc:language>en-us</dc:language>
<dc:creator>Nobody (nobody@example.org)</dc:creator>
<dc:rights>Public domain</dc:rights>
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
<admin:generatorAgent rdf:resource="http://www.example.org/"/>
<admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/>
<item>
<title>First!</title>
<link>http://127.0.0.1:8080/static/example.com.html</link>
<guid isPermaLink="false">just-an@example.org</guid>
<description>
This has a description.
</description>
<dc:subject>Tag1 Tag2</dc:subject>
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
<content:encoded><![CDATA[
This has a <b>description</b>.]]>
</content:encoded>
</item>
</channel>
</rss>

View file

@ -91,3 +91,71 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
assert (archived_item_path / "warc").exists()
assert not (archived_item_path / "singlefile.html").exists()
def test_generic_rss(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=rss"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://purl.org/dc/elements/1.1/" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1 Tag2" in tags
def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
def test_atom(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=rss"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://www.w3.org/2005/Atom" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags