Mirror of https://github.com/ArchiveBox/ArchiveBox
allow parsing to continue even when fetching URL contents fails
commit 38e54b93fe (parent ecbcb6a1b3)

2 changed files with 6 additions and 3 deletions
@@ -594,8 +594,11 @@ def add(urls: Union[str, List[str]],
     if new_links and depth == 1:
         log_crawl_started(new_links)
         for new_link in new_links:
-            downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
-            new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+            try:
+                downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
+                new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+            except Exception as err:
+                stderr(f'[!] Failed to get contents of URL {new_link.url}', err, color='red')
 
     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
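The added try/except is a standard per-item error-isolation pattern: each URL is fetched inside its own handler, so one bad URL logs a warning instead of aborting the whole crawl, and the links gathered from the URLs that did succeed are still deduplicated afterwards. A minimal self-contained sketch of the same pattern, using hypothetical crawl/parse_links helpers rather than ArchiveBox's real APIs:

from urllib.request import urlopen

def parse_links(text: str) -> list[str]:
    # Hypothetical stand-in for ArchiveBox's real link parser.
    return [word for word in text.split() if word.startswith('http')]

def crawl(urls: list[str]) -> list[str]:
    collected = []
    for url in urls:
        try:
            # Any network or decode error here skips only this one URL.
            body = urlopen(url, timeout=10).read().decode('utf-8', errors='replace')
            collected.extend(parse_links(body))
        except Exception as err:
            print(f'[!] Failed to get contents of URL {url}: {err}')
    # Deduplicate while preserving order, analogous to the
    # dict comprehension over link.url in the diff above.
    return list(dict.fromkeys(collected))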
@@ -176,7 +176,7 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
             ANSI['reset'],
         ))
         print(' ', e)
-        raise SystemExit(1)
+        raise e
 
     else:
         # Source is a path to a local file on the filesystem
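Swapping raise SystemExit(1) for raise e is what makes the new handler in add() effective at all: SystemExit subclasses BaseException, not Exception, so the except Exception block in the first hunk would never catch it and the first failed download would still terminate the process. A short self-contained demonstration of that difference (fetch_old/fetch_new are hypothetical names, not ArchiveBox functions):

def fetch_old(url: str):
    # Old behavior: any failure becomes an immediate process exit.
    raise SystemExit(1)

def fetch_new(url: str):
    # New behavior: re-raise the original error for callers to handle.
    raise OSError(f'could not fetch {url}')

try:
    fetch_new('https://example.com')
except Exception as err:
    print(f'handled, crawl continues: {err}')

try:
    fetch_old('https://example.com')
except Exception as err:
    # Never reached: SystemExit is not an Exception subclass,
    # so this handler cannot intercept it.
    print('unreachable')
# The script exits here with status 1, which is exactly what the
# commit is avoiding inside the crawl loop.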