From 5e2bf73f047f2a647f1497a98aedc4cf76f12832 Mon Sep 17 00:00:00 2001
From: Cristian
Date: Mon, 13 Jul 2020 14:48:25 -0500
Subject: [PATCH] fix: Bugs related to add() refactor

---
 archivebox/index/__init__.py |  6 +++++-
 archivebox/main.py           | 10 ++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 7ea473d7..cd50a185 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -292,7 +292,6 @@ def dedupe_links(existing_links: List[Link],
                  new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
 
     from ..parsers import parse_links
-    # merge existing links in out_dir and new links
     all_links = validate_links(existing_links + new_links)
     all_link_urls = {link.url for link in existing_links}
 
@@ -301,6 +300,11 @@ def dedupe_links(existing_links: List[Link],
         link for link in new_links
         if link.url not in all_link_urls
     ]
+
+    all_links_deduped = {link.url: link for link in all_links}
+    for i in range(len(new_links)):
+        if new_links[i].url in all_links_deduped.keys():
+            new_links[i] = all_links_deduped[new_links[i].url]
     log_deduping_finished(len(new_links))
     return all_links, new_links
 
diff --git a/archivebox/main.py b/archivebox/main.py
index 54b71acc..999e4650 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -520,18 +520,16 @@ def add(urls: Union[str, List[str]],
     write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
     new_links += parse_links_from_source(write_ahead_log)
 
-    all_links, new_links = dedupe_links(all_links, new_links)
-    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
-
     # If we're going one level deeper, download each link and look for more links
+    new_links_depth = []
     if new_links and depth == 1:
         log_crawl_started(new_links)
         for new_link in new_links:
             downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
-            new_links += parse_links_from_source(downloaded_file)
-        all_links, new_links = dedupe_links(all_links, new_links)
-        write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
+            new_links_depth += parse_links_from_source(downloaded_file)
+    all_links, new_links = dedupe_links(all_links, new_links + new_links_depth)
+    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
 
     if index_only:
         return all_links
 