diff --git a/archivebox/archive.py b/archivebox/archive.py
index b4292649..e5056cf9 100755
--- a/archivebox/archive.py
+++ b/archivebox/archive.py
@@ -59,7 +59,6 @@ def load_links(archive_path=OUTPUT_DIR, import_path=None):
     existing_links = []
     if archive_path:
         existing_links = parse_json_links_index(archive_path)
-        existing_links = validate_links(existing_links)
 
     new_links = []
     if import_path:
@@ -178,6 +177,7 @@ if __name__ == '__main__':
     elif stdin_raw_text:
         source = save_source(stdin_raw_text)
 
+    # Step 1: Parse the links and dedupe them with existing archive
    all_links, new_links = load_links(archive_path=out_dir, import_path=source)
 
diff --git a/archivebox/parse.py b/archivebox/parse.py
index 0ce55128..89ac2f94 100644
--- a/archivebox/parse.py
+++ b/archivebox/parse.py
@@ -161,7 +161,7 @@ def parse_rss_export(rss_file):
         rows = leading_removed.split('\n')
 
         def get_row(key):
-            return [r for r in rows if r.startswith('<{}>'.format(key))][0]
+            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
 
         title = str_between(get_row('title'), '<![CDATA[', ']]')
@@ -209,7 +209,6 @@ def parse_shaarli_rss_export(rss_file):
         ts_str = str_between(get_row('published'), '<published>', '</published>')
         time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
-
         info = {
             'url': url,
             'domain': domain(url),
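
Reviewer note (not part of the patch): a minimal sketch of the behavior the
parse.py hunk fixes. get_row() filters raw XML rows with startswith(), so
feeds that indent the children of each <item> never match and the trailing
[0] lookup raises IndexError. The sample rows and the get_row_before /
get_row_after names below are hypothetical, for illustration only.

    # Hypothetical pretty-printed RSS rows, indented the way many feed
    # generators emit them.
    rows = [
        '    <title><![CDATA[Example post]]></title>',
        '    <link>https://example.com/post</link>',
        '    <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>',
    ]

    def get_row_before(key):
        # Pre-patch: leading whitespace defeats startswith(), so the filter
        # matches nothing and [0] raises IndexError.
        return [r for r in rows if r.startswith('<{}>'.format(key))][0]

    def get_row_after(key):
        # Post-patch: strip each row before matching the opening tag.
        return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]

    print(get_row_after('link'))   # matches despite the indentation
    # get_row_before('link')       # IndexError: list index out of range

Stripping each row before matching keeps the existing delimiter-based
parsing approach intact while tolerating pretty-printed exports.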