mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-26 14:10:20 +00:00
fix wget_output_path relative location building
This commit is contained in:
parent
9c166d5bcf
commit
ac56023bfd
1 changed file with 22 additions and 21 deletions
|
@@ -263,29 +263,30 @@ def wget_output_path(link):
|
||||||
# Since the wget algorithm for -E (appending .html) is incredibly complex
|
# Since the wget algorithm for -E (appending .html) is incredibly complex
|
||||||
# instead of trying to emulate it here, we just look in the output folder
|
# instead of trying to emulate it here, we just look in the output folder
|
||||||
# to see what html file wget actually created as the output
|
# to see what html file wget actually created as the output
|
||||||
url_path = without_fragment(without_query(path(link['url']))).strip('/')
|
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
|
||||||
html_parent_folder = (domain(link['url']), *url_path.rsplit('/', 1)[0].split('/'))
|
full_path = without_fragment(without_query(path(link['url']))).strip('/')
|
||||||
look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *html_parent_folder)
|
search_dir = os.path.join(
|
||||||
|
link_dir,
|
||||||
|
domain(link['url']),
|
||||||
|
full_path,
|
||||||
|
)
|
||||||
|
|
||||||
# look inside innermost path folder for an html file
|
for _ in range(4):
|
||||||
if os.path.exists(look_in):
|
if os.path.exists(search_dir):
|
||||||
html_files = [
|
if os.path.isdir(search_dir):
|
||||||
f for f in os.listdir(look_in)
|
html_files = [
|
||||||
if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
|
f for f in os.listdir(search_dir)
|
||||||
]
|
if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
|
||||||
if html_files:
|
]
|
||||||
return urlencode(os.path.join(*html_parent_folder, html_files[0]))
|
if html_files:
|
||||||
|
relative_path = search_dir.split(link_dir)[-1].strip('/')
|
||||||
|
return urlencode(os.path.join(relative_path, html_files[0]))
|
||||||
|
|
||||||
# Look one level up in case last path fragment was a file and not a folder
|
# Move up one directory level
|
||||||
look_in = look_in.rsplit('/', 1)[0]
|
search_dir = search_dir.rsplit('/', 1)[0]
|
||||||
html_parent_folder = html_parent_folder[:-1]
|
|
||||||
if os.path.exists(look_in):
|
if search_dir == link_dir:
|
||||||
html_files = [
|
break
|
||||||
f for f in os.listdir(look_in)
|
|
||||||
if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
|
|
||||||
]
|
|
||||||
if html_files:
|
|
||||||
return urlencode(os.path.join(*html_parent_folder, html_files[0]))
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue