__package__ = 'archivebox.index'

import re

from io import StringIO
from pathlib import Path
from typing import List, Tuple, Iterator

from django.db.models import QuerySet
from django.db import transaction

from .schema import Link
from ..util import enforce_types, parse_date
from ..config import (
    OUTPUT_DIR,
    TAG_SEPARATOR_PATTERN,
)


### Main Links Index


@enforce_types
def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
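    """Iterate over all Snapshots in the SQL main index, yielding each one as a Link."""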
    from core.models import Snapshot

    return (
        Link.from_json(page.as_json(*Snapshot.keys))
        for page in Snapshot.objects.all()
    )


@enforce_types
def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
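    """Delete the given Snapshots from the SQL main index, optionally inside a single atomic transaction."""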
    if atomic:
        with transaction.atomic():
            return snapshots.delete()
    return snapshots.delete()


@enforce_types
def write_link_to_sql_index(link: Link):
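    """Create or update the Snapshot row (and its ArchiveResult history) for a single Link, returning the Snapshot."""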
    from core.models import Snapshot, ArchiveResult

    info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}

    # split the tags string on the configured separator(s) and de-duplicate while preserving order
    tag_list = list(dict.fromkeys(
        tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
    ))
    info.pop('tags')

    try:
        snapshot = Snapshot.objects.get(url=link.url)
        info["timestamp"] = snapshot.timestamp
    except Snapshot.DoesNotExist:
        # bump the timestamp until it is unique among existing Snapshots
        while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
            info["timestamp"] = str(float(info["timestamp"]) + 1.0)

    snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
    snapshot.save_tags(tag_list)

    # history entries may be plain dicts (parsed from JSON) or ArchiveResult schema objects
    for extractor, entries in link.history.items():
        for entry in entries:
            if isinstance(entry, dict):
                result, _ = ArchiveResult.objects.get_or_create(
                    snapshot_id=snapshot.pk,
                    extractor=extractor,
                    start_ts=parse_date(entry['start_ts']),
                    defaults={
                        'end_ts': parse_date(entry['end_ts']),
                        'cmd': entry['cmd'],
                        'output': entry['output'],
                        'cmd_version': entry.get('cmd_version') or 'unknown',
                        'pwd': entry['pwd'],
                        'status': entry['status'],
                    }
                )
            else:
                result, _ = ArchiveResult.objects.update_or_create(
                    snapshot_id=snapshot.pk,
                    extractor=extractor,
                    start_ts=parse_date(entry.start_ts),
                    defaults={
                        'end_ts': parse_date(entry.end_ts),
                        'cmd': entry.cmd,
                        'output': entry.output,
                        'cmd_version': entry.cmd_version or 'unknown',
                        'pwd': entry.pwd,
                        'status': entry.status,
                    }
                )
    return snapshot


@enforce_types
def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
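    """Write the given Links to the SQL main index by upserting each one in turn."""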
    for link in links:
        # with transaction.atomic():
        #     write_link_to_sql_index(link)
        write_link_to_sql_index(link)


@enforce_types
def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
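    """Update an existing Snapshot's title and tags from a Link, creating the Snapshot first if needed."""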
    from core.models import Snapshot

    # with transaction.atomic():
    #     try:
    #         snap = Snapshot.objects.get(url=link.url)
    #     except Snapshot.DoesNotExist:
    #         snap = write_link_to_sql_index(link)
    #     snap.title = link.title
    try:
        snap = Snapshot.objects.get(url=link.url)
    except Snapshot.DoesNotExist:
        snap = write_link_to_sql_index(link)

    snap.title = link.title

    # merge the Link's tags with any tags already set on the Snapshot
    tag_list = list(
        {tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')}
        | set(snap.tags.values_list('name', flat=True))
    )

    snap.save()
    snap.save_tags(tag_list)


@enforce_types
def list_migrations(out_dir: Path=OUTPUT_DIR) -> List[Tuple[bool, str]]:
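    """Return (is_applied, migration_name) tuples parsed from Django's showmigrations output."""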
    from django.core.management import call_command

    out = StringIO()
    call_command("showmigrations", list=True, stdout=out)
    out.seek(0)

    migrations = []
    for line in out.readlines():
        if line.strip() and ']' in line:
            # migration lines look like " [X] 0001_initial", so the bracketed
            # prefix indicates whether the migration has been applied
            status_str, name_str = line.strip().split(']', 1)
            is_applied = 'X' in status_str
            migration_name = name_str.strip()
            migrations.append((is_applied, migration_name))
    return migrations


@enforce_types
def apply_migrations(out_dir: Path=OUTPUT_DIR) -> List[str]:
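    """Run Django's makemigrations and migrate, returning the non-empty output lines from migrate."""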
    from django.core.management import call_command

    null, out = StringIO(), StringIO()
    try:
        call_command("makemigrations", interactive=False, stdout=null)
    except Exception as e:
        print('[!] Failed to create some migrations. Please open an issue and copy paste this output for help: {}'.format(e))
        print()

    call_command("migrate", interactive=False, stdout=out)
    out.seek(0)
    return [line.strip() for line in out.readlines() if line.strip()]


@enforce_types
def get_admins(out_dir: Path=OUTPUT_DIR) -> QuerySet:
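    """Return the QuerySet of all superuser accounts."""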
    from django.contrib.auth.models import User

    return User.objects.filter(is_superuser=True)