mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 06:34:16 +00:00
feat: Update update
command to work with querysets
This commit is contained in:
parent
dafa1dd63c
commit
f55153eab3
4 changed files with 84 additions and 56 deletions
|
@ -392,45 +392,50 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
|
|||
return snapshots.filter(q_filter)
|
||||
|
||||
|
||||
def get_indexed_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """Map each snapshot's link directory to its Link, with no status or validity checks.

    `snapshots` must expose `.iterator()` and yield objects with `.as_link()`
    (presumably a Django QuerySet of Snapshot rows -- confirm against callers).
    `out_dir` is not used by this function.
    """
    folders = {}
    # Convert every snapshot first, then index the resulting links by directory.
    for link in [snapshot.as_link() for snapshot in snapshots.iterator()]:
        folders[link.link_dir] = link
    return folders
|
||||
|
||||
def get_archived_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """Map link directories to Links for the snapshots that `is_archived` accepts.

    `snapshots` must expose `.iterator()` and yield objects with `.as_link()`
    (presumably a Django QuerySet of Snapshot rows -- confirm against callers).
    `out_dir` is not used by this function.
    """
    # Convert every snapshot before filtering, so all `as_link()` calls happen
    # ahead of the first `is_archived` check.
    candidates = [snapshot.as_link() for snapshot in snapshots.iterator()]
    return {link.link_dir: link for link in candidates if is_archived(link)}
|
||||
|
||||
def get_unarchived_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """Map link directories to Links for the snapshots that `is_unarchived` accepts.

    `snapshots` must expose `.iterator()` and yield objects with `.as_link()`
    (presumably a Django QuerySet of Snapshot rows -- confirm against callers).
    `out_dir` is not used by this function.
    """
    # Convert every snapshot before filtering, so all `as_link()` calls happen
    # ahead of the first `is_unarchived` check.
    candidates = [snapshot.as_link() for snapshot in snapshots.iterator()]
    return {link.link_dir: link for link in candidates if is_unarchived(link)}
|
||||
|
||||
def get_present_folders(_snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that actually exist in the archive/ folder

    Scans `<out_dir>/<ARCHIVE_DIR_NAME>` and returns a dict keyed by each
    sub-directory's name. The value is whatever `parse_json_link_details`
    returns for that directory, or None when parsing raises.

    `_snapshots` is ignored; it is accepted only so this function can be
    called with the same arguments as the other `get_*_folders` helpers.
    """
    all_folders = {}

    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
        if not entry.is_dir():
            continue

        link = None
        try:
            # `entry` is a pathlib.Path, which has no `.path` attribute (that is
            # os.DirEntry API from the previous os.scandir loop). Accessing it
            # raised AttributeError, which the handler below swallowed, so every
            # folder mapped to None. Pass the directory path as a string instead.
            link = parse_json_link_details(str(entry))
        except Exception:
            # Deliberate best effort: a directory whose index cannot be parsed
            # is still reported as present, just without a Link.
            pass

        all_folders[entry.name] = link

    return all_folders
|
||||
|
||||
def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
def get_valid_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs with a valid index matched to the main index and archived content"""
|
||||
links = [snapshot.as_link() for snapshot in snapshots.iterator()]
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in filter(is_valid, links)
|
||||
|
|
|
@ -29,22 +29,28 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: str=OUTPUT_DIR) ->
|
|||
with transaction.atomic():
|
||||
snapshots.delete()
|
||||
|
||||
@enforce_types
def write_link_to_sql_index(link: Link):
    """Create or update the Snapshot row for `link.url` and return it.

    Only the fields of `link` whose names appear in `Snapshot.keys` are written.
    If a row with the same URL already exists, its timestamp is kept. Otherwise
    the link's timestamp is bumped by 1.0 until no existing row uses it.
    """
    from core.models import Snapshot

    snapshot_fields = {}
    for field, value in link._asdict().items():
        if field in Snapshot.keys:
            snapshot_fields[field] = value

    try:
        existing = Snapshot.objects.get(url=link.url)
        snapshot_fields["timestamp"] = existing.timestamp
    except Snapshot.DoesNotExist:
        # New URL: pick the first timestamp not already taken by another row.
        candidate = snapshot_fields["timestamp"]
        while Snapshot.objects.filter(timestamp=candidate).exists():
            candidate = str(float(candidate) + 1.0)
        snapshot_fields["timestamp"] = candidate

    snapshot, _created = Snapshot.objects.update_or_create(url=link.url, defaults=snapshot_fields)
    return snapshot
|
||||
|
||||
|
||||
@enforce_types
def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """Write every link in `links` to the SQL index inside one transaction.

    Django is set up for `out_dir` first (with `check_db=True`). Each link is
    then written with `write_link_to_sql_index`. Because all writes share one
    `transaction.atomic()` block, an exception on any link rolls back the batch.
    """
    setup_django(out_dir, check_db=True)
    # Imported only after setup_django() has run. The Snapshot model is no
    # longer imported here: write_link_to_sql_index does its own import.
    from django.db import transaction

    with transaction.atomic():
        for link in links:
            write_link_to_sql_index(link)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:
|
||||
|
@ -53,7 +59,10 @@ def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:
|
|||
from django.db import transaction
|
||||
|
||||
with transaction.atomic():
|
||||
snap = Snapshot.objects.get(url=link.url)
|
||||
try:
|
||||
snap = Snapshot.objects.get(url=link.url)
|
||||
except Snapshot.DoesNotExist:
|
||||
snap = write_link_to_sql_index(link)
|
||||
snap.title = link.title
|
||||
snap.tags = link.tags
|
||||
snap.save()
|
||||
|
|
|
@ -659,24 +659,18 @@ def update(resume: Optional[float]=None,
|
|||
|
||||
check_data_folder(out_dir=out_dir)
|
||||
check_dependencies()
|
||||
new_links: List[Link] = [] # TODO: Remove input argument: only_new
|
||||
|
||||
# Step 1: Load list of links from the existing index
|
||||
# merge in and dedupe new links from import_path
|
||||
new_links: List[Link] = []
|
||||
all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
|
||||
|
||||
# Step 2: Write updated index with deduped old and new links back to disk
|
||||
# write_main_index(links=list(all_links), out_dir=out_dir)
|
||||
|
||||
# Step 3: Filter for selected_links
|
||||
matching_links = list_links(
|
||||
# Step 1: Filter for selected_links
|
||||
matching_snapshots = list_links(
|
||||
filter_patterns=filter_patterns,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
)
|
||||
|
||||
matching_folders = list_folders(
|
||||
links=list(matching_links),
|
||||
links=matching_snapshots,
|
||||
status=status,
|
||||
out_dir=out_dir,
|
||||
)
|
||||
|
@ -685,7 +679,7 @@ def update(resume: Optional[float]=None,
|
|||
if index_only:
|
||||
return all_links
|
||||
|
||||
# Step 3: Run the archive methods for each link
|
||||
# Step 2: Run the archive methods for each link
|
||||
to_archive = new_links if only_new else all_links
|
||||
if resume:
|
||||
to_archive = [
|
||||
|
@ -700,8 +694,8 @@ def update(resume: Optional[float]=None,
|
|||
archive_links(to_archive, overwrite=overwrite, out_dir=out_dir)
|
||||
|
||||
# Step 4: Re-write links index with updated titles, icons, and resources
|
||||
all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
|
||||
write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
|
||||
all_links = load_main_index(out_dir=out_dir)
|
||||
write_static_index([link.as_link() for link in all_links], out_dir=out_dir)
|
||||
return all_links
|
||||
|
||||
@enforce_types
|
||||
|
@ -743,7 +737,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
|
|||
# snapshots = sorted(links, key=lambda link: getattr(link, sort))
|
||||
|
||||
folders = list_folders(
|
||||
links=[snapshot.as_link() for snapshot in snapshots],
|
||||
links=snapshots,
|
||||
status=status,
|
||||
out_dir=out_dir,
|
||||
)
|
||||
|
@ -782,30 +776,23 @@ def list_folders(links: List[Link],
|
|||
|
||||
check_data_folder(out_dir=out_dir)
|
||||
|
||||
if status == 'indexed':
|
||||
return get_indexed_folders(links, out_dir=out_dir)
|
||||
elif status == 'archived':
|
||||
return get_archived_folders(links, out_dir=out_dir)
|
||||
elif status == 'unarchived':
|
||||
return get_unarchived_folders(links, out_dir=out_dir)
|
||||
STATUS_FUNCTIONS = {
|
||||
"indexed": get_indexed_folders,
|
||||
"archived": get_archived_folders,
|
||||
"unarchived": get_unarchived_folders,
|
||||
"present": get_present_folders,
|
||||
"valid": get_valid_folders,
|
||||
"invalid": get_invalid_folders,
|
||||
"duplicate": get_duplicate_folders,
|
||||
"orphaned": get_orphaned_folders,
|
||||
"corrupted": get_corrupted_folders,
|
||||
"unrecognized": get_unrecognized_folders,
|
||||
}
|
||||
|
||||
elif status == 'present':
|
||||
return get_present_folders(links, out_dir=out_dir)
|
||||
elif status == 'valid':
|
||||
return get_valid_folders(links, out_dir=out_dir)
|
||||
elif status == 'invalid':
|
||||
return get_invalid_folders(links, out_dir=out_dir)
|
||||
|
||||
elif status == 'duplicate':
|
||||
return get_duplicate_folders(links, out_dir=out_dir)
|
||||
elif status == 'orphaned':
|
||||
return get_orphaned_folders(links, out_dir=out_dir)
|
||||
elif status == 'corrupted':
|
||||
return get_corrupted_folders(links, out_dir=out_dir)
|
||||
elif status == 'unrecognized':
|
||||
return get_unrecognized_folders(links, out_dir=out_dir)
|
||||
|
||||
raise ValueError('Status not recognized.')
|
||||
try:
|
||||
return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
|
||||
except KeyError:
|
||||
raise ValueError('Status not recognized.')
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
|
27
tests/test_update.py
Normal file
27
tests/test_update.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
def test_update_status_invalid(tmp_path, process, disable_extractors_dict):
    """`archivebox update --status=invalid` re-indexes a removed URL whose folder is still on disk.

    Steps: add a URL, check that `archive/` is non-empty, remove the URL, check
    that `core_snapshot` is empty, run `update --status=invalid`, then check
    that the first `core_snapshot` row holds the URL again.

    The `process` fixture is not referenced directly; presumably it initialises
    the archive in `tmp_path` -- confirm in the fixtures module.
    """
    url = 'http://127.0.0.1:8080/static/example.com.html'

    def first_snapshot_row():
        # Read-only query, so no commit is needed. The connection is closed
        # even if the query raises.
        conn = sqlite3.connect(str(tmp_path / "index.sqlite3"))
        try:
            return conn.execute("SELECT * FROM core_snapshot").fetchone()
        finally:
            conn.close()

    subprocess.run(['archivebox', 'add', url], capture_output=True, env=disable_extractors_dict)
    assert list((tmp_path / "archive").iterdir()) != []

    subprocess.run(['archivebox', 'remove', url, '--yes'], capture_output=True)

    assert first_snapshot_row() is None

    subprocess.run(['archivebox', 'update', '--status=invalid'], capture_output=True, env=disable_extractors_dict)

    restored = first_snapshot_row()
    # Fail with a clear assertion instead of a TypeError from indexing None.
    assert restored is not None
    # Column index 1 is expected to hold the snapshot URL.
    assert restored[1] == url
|
Loading…
Reference in a new issue