mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-26 14:10:20 +00:00
fix RSS parser bailing out when lines have whitespace before tags
This commit is contained in:
parent
3571ef24e4
commit
eff0100971
2 changed files with 2 additions and 3 deletions
|
@ -59,7 +59,6 @@ def load_links(archive_path=OUTPUT_DIR, import_path=None):
|
||||||
existing_links = []
|
existing_links = []
|
||||||
if archive_path:
|
if archive_path:
|
||||||
existing_links = parse_json_links_index(archive_path)
|
existing_links = parse_json_links_index(archive_path)
|
||||||
existing_links = validate_links(existing_links)
|
|
||||||
|
|
||||||
new_links = []
|
new_links = []
|
||||||
if import_path:
|
if import_path:
|
||||||
|
@ -178,6 +177,7 @@ if __name__ == '__main__':
|
||||||
elif stdin_raw_text:
|
elif stdin_raw_text:
|
||||||
source = save_source(stdin_raw_text)
|
source = save_source(stdin_raw_text)
|
||||||
|
|
||||||
|
|
||||||
# Step 1: Parse the links and dedupe them with existing archive
|
# Step 1: Parse the links and dedupe them with existing archive
|
||||||
all_links, new_links = load_links(archive_path=out_dir, import_path=source)
|
all_links, new_links = load_links(archive_path=out_dir, import_path=source)
|
||||||
|
|
||||||
|
|
|
@ -161,7 +161,7 @@ def parse_rss_export(rss_file):
|
||||||
rows = leading_removed.split('\n')
|
rows = leading_removed.split('\n')
|
||||||
|
|
||||||
def get_row(key):
|
def get_row(key):
|
||||||
return [r for r in rows if r.startswith('<{}>'.format(key))][0]
|
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
|
||||||
|
|
||||||
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
|
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
|
||||||
url = str_between(get_row('link'), '<link>', '</link>')
|
url = str_between(get_row('link'), '<link>', '</link>')
|
||||||
|
@ -209,7 +209,6 @@ def parse_shaarli_rss_export(rss_file):
|
||||||
ts_str = str_between(get_row('published'), '<published>', '</published>')
|
ts_str = str_between(get_row('published'), '<published>', '</published>')
|
||||||
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
|
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
|
||||||
|
|
||||||
|
|
||||||
info = {
|
info = {
|
||||||
'url': url,
|
'url': url,
|
||||||
'domain': domain(url),
|
'domain': domain(url),
|
||||||
|
|
Loading…
Reference in a new issue