mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-22 20:23:12 +00:00
disable wget --mirror in favor of timestamping
This commit is contained in:
parent
0551e5b9e0
commit
e09c704d50
1 changed file with 4 additions and 2 deletions
|
@@ -168,7 +168,7 @@ def fetch_wget(out_dir, link, overwrite=False):
|
||||||
if not os.path.exists('{}/{}'.format(out_dir, domain)) or overwrite:
|
if not os.path.exists('{}/{}'.format(out_dir, domain)) or overwrite:
|
||||||
print(' - Downloading Full Site')
|
print(' - Downloading Full Site')
|
||||||
CMD = [
|
CMD = [
|
||||||
*'wget --mirror --adjust-extension --convert-links --no-parent'.split(' '),
|
*'wget --timestamping --adjust-extension --convert-links --no-parent'.split(' '),
|
||||||
*(('--page-requisites',) if FETCH_WGET_IMAGES else ()),
|
*(('--page-requisites',) if FETCH_WGET_IMAGES else ()),
|
||||||
link['url'],
|
link['url'],
|
||||||
]
|
]
|
||||||
|
@@ -225,8 +225,9 @@ def archive_dot_org(out_dir, link, overwrite=False):
|
||||||
submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0])
|
submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0])
|
||||||
|
|
||||||
success = False
|
success = False
|
||||||
|
CMD = ['curl', '-I', submit_url]
|
||||||
try:
|
try:
|
||||||
result = run(['curl', '-I', submit_url], stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=TIMEOUT) # archive.org
|
result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=TIMEOUT) # archive.org
|
||||||
headers = result.stdout.splitlines()
|
headers = result.stdout.splitlines()
|
||||||
content_location = [h for h in headers if b'Content-Location: ' in h]
|
content_location = [h for h in headers if b'Content-Location: ' in h]
|
||||||
if content_location:
|
if content_location:
|
||||||
|
@@ -234,6 +235,7 @@ def archive_dot_org(out_dir, link, overwrite=False):
|
||||||
saved_url = 'https://web.archive.org{}'.format(archive_path)
|
saved_url = 'https://web.archive.org{}'.format(archive_path)
|
||||||
success = True
|
success = True
|
||||||
else:
|
else:
|
||||||
|
print(' Visit url to see output:', ' '.join(CMD))
|
||||||
raise Exception('Failed to find Content-Location URL in Archive.org response headers.')
|
raise Exception('Failed to find Content-Location URL in Archive.org response headers.')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(' Exception: {} {}'.format(e.__class__.__name__, e))
|
print(' Exception: {} {}'.format(e.__class__.__name__, e))
|
||||||
|
|
Loading…
Reference in a new issue