mirror of https://github.com/ArchiveBox/ArchiveBox (synced 2024-11-10 06:34:16 +00:00)
add overwrite flag to add command to force re-archiving
This commit is contained in:
parent da671532a4
commit b681a477ae

4 changed files with 35 additions and 18 deletions
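The commit wires a new --overwrite flag into the add subcommand and forwards it down to the archiving logic. Below is a minimal standalone sketch of how the flag is expected to parse, mirroring the parser.add_argument() call in the first hunk; the urls positional argument and the prog name are illustrative assumptions, not taken from this diff.

    # Standalone illustration of the new flag's argparse behavior; this is not
    # ArchiveBox's actual parser module, just the add_argument() call from the
    # hunk below exercised in isolation.
    import argparse

    parser = argparse.ArgumentParser(prog='archivebox add')
    parser.add_argument(
        '--overwrite',
        default=False,
        action='store_true',
        help='Re-archive URLs from scratch, overwriting any existing files',
    )
    parser.add_argument('urls', nargs='*')  # assumed positional, for illustration only

    command = parser.parse_args(['--overwrite', 'https://example.com'])
    assert command.overwrite is True  # later forwarded as add(..., overwrite=command.overwrite)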
@@ -55,6 +55,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         type=int,
         help="Recursively archive all linked pages up to this many hops away"
     )
+    parser.add_argument(
+        "--overwrite",
+        default=False,
+        action="store_true",
+        help="Re-archive URLs from scratch, overwriting any existing files"
+    )
     command = parser.parse_args(args or ())
     urls = command.urls
     stdin_urls = accept_stdin(stdin)
@@ -69,6 +75,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         depth=command.depth,
         update_all=command.update_all,
         index_only=command.index_only,
+        overwrite=command.overwrite,
         out_dir=pwd or OUTPUT_DIR,
     )
@@ -36,18 +36,18 @@ from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 
 def get_default_archive_methods():
     return [
-        ('title', should_save_title, save_title),
-        ('favicon', should_save_favicon, save_favicon),
-        ('wget', should_save_wget, save_wget),
-        ('singlefile', should_save_singlefile, save_singlefile),
-        ('pdf', should_save_pdf, save_pdf),
-        ('screenshot', should_save_screenshot, save_screenshot),
-        ('dom', should_save_dom, save_dom),
-        ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them
-        ('git', should_save_git, save_git),
-        ('media', should_save_media, save_media),
-        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
-    ]
+        ('title', should_save_title, save_title),
+        ('favicon', should_save_favicon, save_favicon),
+        ('wget', should_save_wget, save_wget),
+        ('singlefile', should_save_singlefile, save_singlefile),
+        ('pdf', should_save_pdf, save_pdf),
+        ('screenshot', should_save_screenshot, save_screenshot),
+        ('dom', should_save_dom, save_dom),
+        ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them
+        ('git', should_save_git, save_git),
+        ('media', should_save_media, save_media),
+        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
+    ]
 
 @enforce_types
 def ignore_methods(to_ignore: List[str]):
@@ -522,6 +522,7 @@ def add(urls: Union[str, List[str]],
         depth: int=0,
         update_all: bool=not ONLY_NEW,
         index_only: bool=False,
+        overwrite: bool=False,
         out_dir: str=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""
 
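With the signature above, overwrite can also be passed when calling add() directly from Python. A hedged usage sketch follows; the import path is an assumption, since the diff does not show which module add() lives in.

    # Sketch only: the module path is assumed, not shown in this diff.
    from archivebox.main import add

    # Re-archive the URL from scratch, overwriting any existing extractor output,
    # instead of skipping it because it is already present in the index.
    add('https://example.com', overwrite=True)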
@@ -551,20 +552,28 @@ def add(urls: Union[str, List[str]],
         for new_link in new_links:
             downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
             new_links_depth += parse_links_from_source(downloaded_file)
-    all_links, new_links = dedupe_links(all_links, new_links + new_links_depth)
+
+    imported_links = new_links + new_links_depth
+    all_links, new_links = dedupe_links(all_links, imported_links)
     write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
 
     if index_only:
         return all_links
 
     # Run the archive methods for each link
-    to_archive = all_links if update_all else new_links
-    archive_links(to_archive, out_dir=out_dir)
+    if update_all:
+        archive_links(all_links, overwrite=overwrite, out_dir=out_dir)
+    elif overwrite:
+        archive_links(imported_links, overwrite=True, out_dir=out_dir)
+    elif new_links:
+        archive_links(new_links, overwrite=False, out_dir=out_dir)
+    else:
+        # nothing was updated, don't bother re-saving the index
+        return all_links
 
     # Step 4: Re-write links index with updated titles, icons, and resources
-    if to_archive:
-        all_links = load_main_index(out_dir=out_dir)
-        write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    all_links = load_main_index(out_dir=out_dir)
+    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
     return all_links
 
 @enforce_types
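The hunk above replaces the single to_archive selection with a three-way dispatch. Restated as a small self-contained helper for clarity (a paraphrase of the branch order, not code from the commit):

    # Paraphrase of the dispatch above: which links get archived, and whether
    # existing extractor output should be overwritten.
    def pick_links_to_archive(all_links, new_links, imported_links, update_all, overwrite):
        if update_all:            # --update-all: revisit every link in the index
            return all_links, overwrite
        if overwrite:             # --overwrite: redo this run's imports from scratch
            return imported_links, True
        if new_links:             # default: archive only links not seen before
            return new_links, False
        return [], False          # nothing new was imported, nothing to archive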
@@ -16,6 +16,7 @@ from .util import enforce_types, ExtendedEncoder
 from .config import OUTPUT_PERMISSIONS
 
 
 def run(*args, input=None, capture_output=True, text=False, **kwargs):
     """Patched of subprocess.run to fix blocking io making timeout=innefective"""