mirror of https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-23 04:33:11 +00:00
commit 175e6fa3d0
2 changed files with 9 additions and 7 deletions
@@ -292,7 +292,6 @@ def dedupe_links(existing_links: List[Link],
                  new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
 
     from ..parsers import parse_links
 
     # merge existing links in out_dir and new links
     all_links = validate_links(existing_links + new_links)
     all_link_urls = {link.url for link in existing_links}
@@ -301,6 +300,11 @@ def dedupe_links(existing_links: List[Link],
         link for link in new_links
         if link.url not in all_link_urls
     ]
 
+    all_links_deduped = {link.url: link for link in all_links}
+    for i in range(len(new_links)):
+        if new_links[i].url in all_links_deduped.keys():
+            new_links[i] = all_links_deduped[new_links[i].url]
     log_deduping_finished(len(new_links))
 
     return all_links, new_links
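The lines added to dedupe_links build a url-to-Link mapping from the merged all_links list and rewrite each entry of new_links to point at that canonical object, presumably so later steps see the merged record instead of a bare duplicate. A minimal standalone sketch of that mapping step, using a simplified stand-in for ArchiveBox's Link type (hypothetical names, not the project's API):

from typing import List, NamedTuple, Optional

class Link(NamedTuple):
    # simplified stand-in; the real Link carries timestamps, tags, sources, etc.
    url: str
    title: Optional[str] = None

def replace_with_canonical(all_links: List[Link], new_links: List[Link]) -> List[Link]:
    # same idea as the added lines: map url -> merged Link, then swap new entries for it
    canonical = {link.url: link for link in all_links}
    return [canonical.get(link.url, link) for link in new_links]

all_links = [Link('https://example.com/a', title='A (merged)'), Link('https://example.com/b')]
new_links = [Link('https://example.com/b'), Link('https://example.com/a')]
print(replace_with_canonical(all_links, new_links))
# [Link(url='https://example.com/b', title=None), Link(url='https://example.com/a', title='A (merged)')]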
@@ -520,18 +520,16 @@ def add(urls: Union[str, List[str]],
     write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
 
     new_links += parse_links_from_source(write_ahead_log)
-    all_links, new_links = dedupe_links(all_links, new_links)
-    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
 
     # If we're going one level deeper, download each link and look for more links
+    new_links_depth = []
     if new_links and depth == 1:
         log_crawl_started(new_links)
         for new_link in new_links:
             downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
-            new_links += parse_links_from_source(downloaded_file)
+            new_links_depth += parse_links_from_source(downloaded_file)
 
-    all_links, new_links = dedupe_links(all_links, new_links)
+    all_links, new_links = dedupe_links(all_links, new_links + new_links_depth)
     write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
 
     if index_only:
         return all_links
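The second hunk stops appending crawl results to new_links while that same list is being iterated (which would let a depth=1 crawl keep feeding itself) and drops the extra dedupe_links/write_main_index calls inside the loop; depth-1 discoveries are collected in new_links_depth and merged once at the end. A rough illustration of that collect-then-merge pattern, with a hypothetical crawl_for_urls() standing in for the save_file_as_source() + parse_links_from_source() pair:

from typing import List

def crawl_for_urls(url: str) -> List[str]:
    # hypothetical stand-in: download `url` and return any outbound URLs found in it
    return [url + '/child-1', url + '/child-2']

def expand_one_level(new_urls: List[str]) -> List[str]:
    found_at_depth_1: List[str] = []     # collect separately ...
    for url in new_urls:                 # ... so the list being iterated never grows mid-loop
        found_at_depth_1 += crawl_for_urls(url)
    # dedupe exactly once, after the crawl finishes (order-preserving)
    seen = set()
    merged = []
    for url in new_urls + found_at_depth_1:
        if url not in seen:
            seen.add(url)
            merged.append(url)
    return merged

print(expand_one_level(['https://example.com']))
# ['https://example.com', 'https://example.com/child-1', 'https://example.com/child-2']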