mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2025-02-16 13:28:29 +00:00
Add tests for generic_rss and pinboard_rss parsers
This commit is contained in:
parent
22f9a289d3
commit
1f828d9441
3 changed files with 124 additions and 0 deletions
24
tests/mock_server/templates/example.atom
Normal file
24
tests/mock_server/templates/example.atom
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<feed
|
||||||
|
xml:lang="en"
|
||||||
|
xmlns="http://www.w3.org/2005/Atom"
|
||||||
|
>
|
||||||
|
<id>http://www.example.com/</id>
|
||||||
|
<title>Example of an Atom feed</title>
|
||||||
|
<link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" />
|
||||||
|
<link rel="alternate" type="text/html" href="http://www.example.com/" />
|
||||||
|
<author>
|
||||||
|
<name>Jim Winstead</name>
|
||||||
|
</author>
|
||||||
|
<updated>2024-02-26T03:18:26Z</updated>
|
||||||
|
<entry>
|
||||||
|
<title>Example</title>
|
||||||
|
<link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" />
|
||||||
|
<id>tag:example.com,2024-02-25:3319</id>
|
||||||
|
<updated>2024-02-26T03:18:26Z</updated>
|
||||||
|
<published>2024-02-25T19:18:25-08:00</published>
|
||||||
|
<category term="Tag1" scheme="http://example.com/archive" />
|
||||||
|
<category term="Tag2" scheme="http://example.com/archive" />
|
||||||
|
<content type="html">This is some &lt;b&gt;content&lt;/b&gt;</content>
|
||||||
|
</entry>
|
||||||
|
</feed>
|
32
tests/mock_server/templates/example.rss
Normal file
32
tests/mock_server/templates/example.rss
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<rss version="2.0"
|
||||||
|
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:admin="http://webns.net/mvcb/"
|
||||||
|
xmlns:content="http://purl.org/rss/1.0/modules/content/"
|
||||||
|
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||||
|
<channel>
|
||||||
|
<title>Sample Feed</title>
|
||||||
|
<link>http://example.org/</link>
|
||||||
|
<description>For documentation only</description>
|
||||||
|
<dc:language>en-us</dc:language>
|
||||||
|
<dc:creator>Nobody (nobody@example.org)</dc:creator>
|
||||||
|
<dc:rights>Public domain</dc:rights>
|
||||||
|
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
|
||||||
|
<admin:generatorAgent rdf:resource="http://www.example.org/"/>
|
||||||
|
<admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/>
|
||||||
|
|
||||||
|
<item>
|
||||||
|
<title>First!</title>
|
||||||
|
<link>http://127.0.0.1:8080/static/example.com.html</link>
|
||||||
|
<guid isPermaLink="false">just-an@example.org</guid>
|
||||||
|
<description>
|
||||||
|
This has a description.
|
||||||
|
</description>
|
||||||
|
<dc:subject>Tag1 Tag2</dc:subject>
|
||||||
|
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
|
||||||
|
<content:encoded><![CDATA[
|
||||||
|
This has a <b>description</b>.]]>
|
||||||
|
</content:encoded>
|
||||||
|
</item>
|
||||||
|
</channel>
|
||||||
|
</rss>
|
|
@ -91,3 +91,71 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
|
||||||
|
|
||||||
assert (archived_item_path / "warc").exists()
|
assert (archived_item_path / "warc").exists()
|
||||||
assert not (archived_item_path / "singlefile.html").exists()
|
assert not (archived_item_path / "singlefile.html").exists()
|
||||||
|
|
||||||
|
def test_generic_rss(tmp_path, process, disable_extractors_dict):
    """Check that ``--parser=rss`` imports the item link and tag from example.rss.

    The RSS fixture is piped to ``archivebox add --index-only`` on stdin, then
    ``index.sqlite3`` in the current directory is queried directly.

    ``tmp_path`` and ``process`` are not referenced in the body; presumably they
    are pytest fixtures that prepare the working directory -- confirm in conftest.
    ``disable_extractors_dict`` is passed as the subprocess environment.
    """
    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
        subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        urls = [row[0] for row in c.execute("SELECT url from core_snapshot").fetchall()]
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        # Close the connection even when a query raises, so the handle is not leaked.
        conn.close()

    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert "http://purl.org/dc/elements/1.1/" not in urls

    # this test expects the fixture's whole <dc:subject> value as a single tag
    assert "Tag1 Tag2" in tags
|
||||||
|
|
||||||
|
def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
    """Check that ``--parser=pinboard_rss`` yields separate tags from example.rss.

    The RSS fixture is piped to ``archivebox add --index-only`` on stdin, then
    the tag names stored in ``index.sqlite3`` (current directory) are inspected.

    ``tmp_path`` and ``process`` are not referenced in the body; presumably they
    are pytest fixtures that prepare the working directory -- confirm in conftest.
    ``disable_extractors_dict`` is passed as the subprocess environment.
    """
    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
        subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        # Close the connection even when the query raises, so the handle is not leaked.
        conn.close()

    # this test expects "Tag1" and "Tag2" to be stored as two individual tags
    assert "Tag1" in tags
    assert "Tag2" in tags
|
||||||
|
|
||||||
|
def test_atom(tmp_path, process, disable_extractors_dict):
    """Check that ``--parser=rss`` imports the entry link and tags from example.atom.

    The Atom fixture is piped to ``archivebox add --index-only`` on stdin, then
    ``index.sqlite3`` in the current directory is queried directly.

    ``tmp_path`` and ``process`` are not referenced in the body; presumably they
    are pytest fixtures that prepare the working directory -- confirm in conftest.
    ``disable_extractors_dict`` is passed as the subprocess environment.
    """
    with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f:
        subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        urls = [row[0] for row in c.execute("SELECT url from core_snapshot").fetchall()]
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        # Close the connection even when a query raises, so the handle is not leaked.
        conn.close()

    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert "http://www.w3.org/2005/Atom" not in urls

    # each <category term="..."> in the fixture is expected as its own tag
    assert "Tag1" in tags
    assert "Tag2" in tags
|
||||||
|
|
Loading…
Add table
Reference in a new issue