From c1fd2cfa42f3c81d4c5a2a3c95733f4804fda938 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 3 Jan 2024 20:31:46 -0800 Subject: [PATCH] tag URLs immediately once added instead of waiting until archival completes --- archivebox/extractors/readability.py | 1 - archivebox/index/sql.py | 8 +++++--- archivebox/main.py | 30 +++++++++++++--------------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 13ee63af..dc2a06b9 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -67,7 +67,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO temp_doc.name, link.url, ] - result = run(cmd, cwd=out_dir, timeout=timeout) try: result_json = json.loads(result.stdout) diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 420b9de6..5081c275 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -109,11 +109,13 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: snap = Snapshot.objects.get(url=link.url) except Snapshot.DoesNotExist: snap = write_link_to_sql_index(link) + snap.title = link.title - tag_list = list(dict.fromkeys( - tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '') - )) + tag_list = list( + {tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')} + | set(snap.tags.values_list('name', flat=True)) + ) snap.save() snap.save_tags(tag_list) diff --git a/archivebox/main.py b/archivebox/main.py index 0c98c991..76b204b8 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -604,7 +604,7 @@ def add(urls: Union[str, List[str]], out_dir: Path=OUTPUT_DIR) -> List[Link]: """Add a new URL or list of URLs to your archive""" - from core.models import Tag + from core.models import Snapshot, Tag assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' @@ -648,6 +648,19 @@ def add(urls: Union[str, List[str]], write_main_index(links=new_links, out_dir=out_dir) all_links = load_main_index(out_dir=out_dir) + tags = [ + Tag.objects.get_or_create(name=name.strip())[0] + for name in tag.split(',') + if name.strip() + ] + if tags: + for link in imported_links: + snapshot = Snapshot.objects.get(url=link.url) + snapshot.tags.add(*tags) + snapshot.tags_str(nocache=True) + snapshot.save() + # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}') + if index_only: # mock archive all the links using the fake index_only extractor method in order to update their state if overwrite: @@ -679,21 +692,6 @@ def add(urls: Union[str, List[str]], stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green') archive_links(new_links, overwrite=False, **archive_kwargs) - - # add any tags to imported links - tags = [ - Tag.objects.get_or_create(name=name.strip())[0] - for name in tag.split(',') - if name.strip() - ] - if tags: - for link in imported_links: - snapshot = link.as_snapshot() - snapshot.tags.add(*tags) - snapshot.tags_str(nocache=True) - snapshot.save() - # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}') - if CAN_UPGRADE: hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")