feat: Update update command to work with querysets

This commit is contained in:
Cristian 2020-08-22 08:59:25 -05:00 committed by Cristian Vargas
parent dafa1dd63c
commit f55153eab3
4 changed files with 84 additions and 56 deletions

View file

@ -392,45 +392,50 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
return snapshots.filter(q_filter)
def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_indexed_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """indexed links without checking archive status or data directory validity

    Converts every snapshot yielded by ``snapshots.iterator()`` to a Link via
    ``as_link()`` and returns a dict keyed by each link's ``link_dir``.
    ``out_dir`` is not read here; it is accepted so that all the
    ``get_*_folders`` helpers can be called with the same arguments.
    """
    # Materialize every Link first, then build the mapping in a second pass.
    links = [snap.as_link() for snap in snapshots.iterator()]

    folders = {}
    for link in links:
        folders[link.link_dir] = link
    return folders
def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_archived_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """indexed links that are archived with a valid data directory

    Converts every snapshot yielded by ``snapshots.iterator()`` to a Link and
    keeps only those for which ``is_archived(link)`` is truthy, keyed by
    ``link.link_dir``.  ``out_dir`` is not read here; it is accepted so that
    all the ``get_*_folders`` helpers share one call signature.
    """
    # Convert all snapshots up front so the predicate runs only after the
    # queryset has been fully consumed.
    links = []
    for snap in snapshots.iterator():
        links.append(snap.as_link())

    return {link.link_dir: link for link in links if is_archived(link)}
def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_unarchived_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """indexed links that are unarchived with no data directory or an empty data directory

    Converts every snapshot yielded by ``snapshots.iterator()`` to a Link and
    keeps only those for which ``is_unarchived(link)`` is truthy, keyed by
    ``link.link_dir``.  ``out_dir`` is not read here; it is accepted so that
    all the ``get_*_folders`` helpers share one call signature.
    """
    # Convert all snapshots up front so the predicate runs only after the
    # queryset has been fully consumed.
    links = []
    for snap in snapshots.iterator():
        links.append(snap.as_link())

    return {link.link_dir: link for link in links if is_unarchived(link)}
def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_present_folders(_snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that actually exist in the archive/ folder

    Walks the immediate children of ``<out_dir>/<ARCHIVE_DIR_NAME>`` and, for
    every directory found, tries to load its link details from disk.

    Args:
        _snapshots: ignored; present only so this function can be called with
            the same arguments as the other ``get_*_folders`` helpers.
        out_dir: root of the collection that contains the archive folder.

    Returns:
        A dict mapping each directory's name to the Link parsed from it, or to
        ``None`` when the details could not be parsed.
    """
    all_folders = {}
    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
        if entry.is_dir():
            link = None
            try:
                # `entry` is a pathlib.Path, which has no `.path` attribute
                # (that belonged to os.DirEntry).  Reading `entry.path` raised
                # AttributeError, which the handler below swallowed, so every
                # folder was reported with link=None.  Pass the path string.
                link = parse_json_link_details(str(entry))
            except Exception:
                # Best effort: a folder with a missing/unreadable index is
                # still "present", it just has no associated Link.
                pass
            all_folders[entry.name] = link
    return all_folders
def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_valid_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs with a valid index matched to the main index and archived content"""
links = [snapshot.as_link() for snapshot in snapshots.iterator()]
return {
link.link_dir: link
for link in filter(is_valid, links)

View file

@ -29,22 +29,28 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: str=OUTPUT_DIR) ->
with transaction.atomic():
snapshots.delete()
@enforce_types
def write_link_to_sql_index(link: Link):
    """Create or update the Snapshot row for ``link`` and return it.

    Only the fields of ``link`` whose names appear in ``Snapshot.keys`` are
    written.  If a Snapshot with the same URL already exists its timestamp is
    reused; otherwise the link's timestamp is bumped by 1.0 until no other
    Snapshot has the same timestamp.
    """
    from core.models import Snapshot

    fields = {}
    for key, value in link._asdict().items():
        if key in Snapshot.keys:
            fields[key] = value

    try:
        existing = Snapshot.objects.get(url=link.url)
    except Snapshot.DoesNotExist:
        # New URL: move the timestamp forward until it collides with no row.
        while Snapshot.objects.filter(timestamp=fields["timestamp"]).exists():
            fields["timestamp"] = str(float(fields["timestamp"]) + 1.0)
    else:
        # Known URL: keep the timestamp that is already stored.
        fields["timestamp"] = existing.timestamp

    snapshot, _created = Snapshot.objects.update_or_create(url=link.url, defaults=fields)
    return snapshot
@enforce_types
def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """Write every link in ``links`` to the SQL index inside one transaction.

    Args:
        links: the links to create or update in the SQL index.
        out_dir: collection directory handed to ``setup_django``.

    The per-link upsert (timestamp reuse / de-duplication followed by
    ``update_or_create``) lives in ``write_link_to_sql_index``; this function
    only delegates to it, instead of repeating the same logic inline and then
    calling the helper as well.
    """
    setup_django(out_dir, check_db=True)
    # Imported after setup_django(), matching the original statement order.
    from django.db import transaction

    with transaction.atomic():
        for link in links:
            write_link_to_sql_index(link)
@enforce_types
def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:
@ -53,7 +59,10 @@ def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:
from django.db import transaction
with transaction.atomic():
snap = Snapshot.objects.get(url=link.url)
try:
snap = Snapshot.objects.get(url=link.url)
except Snapshot.DoesNotExist:
snap = write_link_to_sql_index(link)
snap.title = link.title
snap.tags = link.tags
snap.save()

View file

@ -659,24 +659,18 @@ def update(resume: Optional[float]=None,
check_data_folder(out_dir=out_dir)
check_dependencies()
new_links: List[Link] = [] # TODO: Remove input argument: only_new
# Step 1: Load list of links from the existing index
# merge in and dedupe new links from import_path
new_links: List[Link] = []
all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
# Step 2: Write updated index with deduped old and new links back to disk
# write_main_index(links=list(all_links), out_dir=out_dir)
# Step 3: Filter for selected_links
matching_links = list_links(
# Step 1: Filter for selected_links
matching_snapshots = list_links(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
)
matching_folders = list_folders(
links=list(matching_links),
links=matching_snapshots,
status=status,
out_dir=out_dir,
)
@ -685,7 +679,7 @@ def update(resume: Optional[float]=None,
if index_only:
return all_links
# Step 3: Run the archive methods for each link
# Step 2: Run the archive methods for each link
to_archive = new_links if only_new else all_links
if resume:
to_archive = [
@ -700,8 +694,8 @@ def update(resume: Optional[float]=None,
archive_links(to_archive, overwrite=overwrite, out_dir=out_dir)
# Step 4: Re-write links index with updated titles, icons, and resources
all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
all_links = load_main_index(out_dir=out_dir)
write_static_index([link.as_link() for link in all_links], out_dir=out_dir)
return all_links
@enforce_types
@ -743,7 +737,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
# snapshots = sorted(links, key=lambda link: getattr(link, sort))
folders = list_folders(
links=[snapshot.as_link() for snapshot in snapshots],
links=snapshots,
status=status,
out_dir=out_dir,
)
@ -782,30 +776,23 @@ def list_folders(links: List[Link],
check_data_folder(out_dir=out_dir)
if status == 'indexed':
return get_indexed_folders(links, out_dir=out_dir)
elif status == 'archived':
return get_archived_folders(links, out_dir=out_dir)
elif status == 'unarchived':
return get_unarchived_folders(links, out_dir=out_dir)
STATUS_FUNCTIONS = {
"indexed": get_indexed_folders,
"archived": get_archived_folders,
"unarchived": get_unarchived_folders,
"present": get_present_folders,
"valid": get_valid_folders,
"invalid": get_invalid_folders,
"duplicate": get_duplicate_folders,
"orphaned": get_orphaned_folders,
"corrupted": get_corrupted_folders,
"unrecognized": get_unrecognized_folders,
}
elif status == 'present':
return get_present_folders(links, out_dir=out_dir)
elif status == 'valid':
return get_valid_folders(links, out_dir=out_dir)
elif status == 'invalid':
return get_invalid_folders(links, out_dir=out_dir)
elif status == 'duplicate':
return get_duplicate_folders(links, out_dir=out_dir)
elif status == 'orphaned':
return get_orphaned_folders(links, out_dir=out_dir)
elif status == 'corrupted':
return get_corrupted_folders(links, out_dir=out_dir)
elif status == 'unrecognized':
return get_unrecognized_folders(links, out_dir=out_dir)
raise ValueError('Status not recognized.')
try:
return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
except KeyError:
raise ValueError('Status not recognized.')
@enforce_types

27
tests/test_update.py Normal file
View file

@ -0,0 +1,27 @@
import sqlite3
from .fixtures import *
def test_update_status_invalid(tmp_path, process, disable_extractors_dict):
    """`archivebox update --status=invalid` re-indexes a removed snapshot.

    Adds a URL, removes it from the index (checking the snapshot table is then
    empty), runs ``update --status=invalid`` and checks that the URL is back
    in ``core_snapshot``.
    """
    url = 'http://127.0.0.1:8080/static/example.com.html'
    db_path = str(tmp_path / "index.sqlite3")

    def first_snapshot_url():
        # Select the column by name rather than relying on its position in
        # `SELECT *`, and always release the connection, even on failure.
        conn = sqlite3.connect(db_path)
        try:
            return conn.execute("SELECT url FROM core_snapshot").fetchone()
        finally:
            conn.close()

    subprocess.run(['archivebox', 'add', url], capture_output=True, env=disable_extractors_dict)
    assert list((tmp_path / "archive").iterdir()) != []

    subprocess.run(['archivebox', 'remove', url, '--yes'], capture_output=True)
    assert first_snapshot_url() is None

    subprocess.run(['archivebox', 'update', '--status=invalid'], capture_output=True, env=disable_extractors_dict)

    row = first_snapshot_url()
    # Fail with a readable assertion instead of a TypeError from indexing None.
    assert row is not None, "update --status=invalid did not restore the snapshot row"
    assert row[0] == url