Mirror of https://github.com/ArchiveBox/ArchiveBox (synced 2024-11-22 20:23:12 +00:00)
allow parsing to continue even when fetching URL contents fails
commit 38e54b93fe
parent ecbcb6a1b3
2 changed files with 6 additions and 3 deletions
@@ -594,8 +594,11 @@ def add(urls: Union[str, List[str]],
     if new_links and depth == 1:
         log_crawl_started(new_links)
         for new_link in new_links:
-            downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
-            new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+            try:
+                downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
+                new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+            except Exception as err:
+                stderr(f'[!] Failed to get contents of URL {new_link.url}', err, color='red')
 
     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
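Taken on its own, the pattern this hunk introduces is per-item error isolation: each URL fetch is wrapped in its own try/except, so a single failure is logged and the loop moves on to the next URL. A minimal runnable sketch of that pattern, where fetch_source() and parse_links() are hypothetical stand-ins for ArchiveBox's save_file_as_source() and parse_links_from_source():

import sys
from urllib.request import urlopen

def fetch_source(url: str) -> str:
    # Hypothetical stand-in for save_file_as_source(): may raise on any network error.
    return urlopen(url, timeout=30).read().decode(errors='replace')

def parse_links(text: str) -> list:
    # Hypothetical stand-in for parse_links_from_source().
    return [token for token in text.split() if token.startswith('http')]

def crawl(urls: list) -> list:
    found = []
    for url in urls:
        try:
            # Any fetch or parse error is confined to this one URL...
            found += parse_links(fetch_source(url))
        except Exception as err:
            # ...logged, and the crawl continues with the remaining URLs.
            print(f'[!] Failed to get contents of URL {url}', err, file=sys.stderr)
    return found

Catching a broad Exception here is deliberate: a depth-1 crawl over many user-supplied URLs should treat any single fetch or parse failure as non-fatal.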
@@ -176,7 +176,7 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
                 ANSI['reset'],
             ))
             print('    ', e)
-            raise SystemExit(1)
+            raise e
 
     else:
         # Source is a path to a local file on the filesystem
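This second hunk is what makes the first one effective: SystemExit derives from BaseException, not Exception, so the old `raise SystemExit(1)` would have sailed straight past the new `except Exception` in add() and killed the process anyway. Re-raising the original error hands the failure back to the caller to handle. A self-contained demonstration of that subclassing detail:

try:
    try:
        raise SystemExit(1)  # the old behavior in save_file_as_source()
    except Exception:
        print('never reached: SystemExit is not an Exception subclass')
except SystemExit as e:
    # Only an explicit `except SystemExit` (or BaseException) intercepts the exit.
    print('SystemExit escaped the except Exception handler, code =', e.code)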