ArchiveBox/archivebox/core/migrations/0007_archiveresult.py

117 lines
4.6 KiB
Python
Raw Normal View History

# Generated by Django 3.0.8 on 2020-11-04 12:25
2024-09-30 22:59:05 +00:00
import os
import json
from pathlib import Path
from django.db import migrations, models
import django.db.models.deletion
from index.json import to_json
# archivebox user data dir: the resolved working directory the migration runs from
DATA_DIR = Path.cwd().resolve()

# archivebox snapshot data dir: the 'archive' folder directly under DATA_DIR
ARCHIVE_DIR = DATA_DIR / 'archive'
2020-12-11 18:50:45 +00:00
# Use the JSONField exposed by django.db.models when this Django version has
# one; otherwise fall back to the field from the separate `jsonfield` package.
if hasattr(models, 'JSONField'):
    JSONField = models.JSONField
else:
    import jsonfield

    JSONField = jsonfield.JSONField
def forwards_func(apps, schema_editor):
    """Create ArchiveResult rows from each snapshot's on-disk ``index.json``.

    For every Snapshot, ``ARCHIVE_DIR/<snapshot.timestamp>/index.json`` is
    loaded and one ArchiveResult is created per entry found under its
    ``history`` mapping (extractor name -> list of result dicts).

    The import is best-effort so that one damaged snapshot folder cannot abort
    the whole migration:

    * an index that cannot be opened or parsed is skipped silently;
    * an index without a usable ``history`` mapping is reported and skipped;
    * a single result that cannot be saved is reported and skipped.

    Args:
        apps: Historical app registry passed in by ``migrations.RunPython``.
        schema_editor: Unused; required by the ``RunPython`` callable signature.
    """
    Snapshot = apps.get_model("core", "Snapshot")
    ArchiveResult = apps.get_model("core", "ArchiveResult")

    for snapshot in Snapshot.objects.all():
        out_dir = ARCHIVE_DIR / snapshot.timestamp

        try:
            with open(out_dir / "index.json", "r") as f:
                fs_index = json.load(f)
        except Exception:
            # Deliberately broad: a missing, unreadable or corrupt index just
            # means there is nothing to import for this snapshot.
            continue

        # A parsed index is not guaranteed to be a dict with a "history" dict;
        # indexing it blindly would crash the migration on one odd file.
        history = fs_index.get("history") if isinstance(fs_index, dict) else None
        if not isinstance(history, dict):
            print(
                ' ! Skipping import due to missing/invalid index.json:',
                out_dir,
                'no usable "history" mapping',
                '(open an issue with this index.json for help)',
            )
            continue

        for extractor, results in history.items():
            # `results` may be null in a hand-edited index; treat as empty.
            for result in results or ():
                try:
                    ArchiveResult.objects.create(
                        extractor=extractor,
                        snapshot=snapshot,
                        pwd=result["pwd"],
                        cmd=result.get("cmd") or [],
                        cmd_version=result.get("cmd_version") or 'unknown',
                        start_ts=result["start_ts"],
                        end_ts=result["end_ts"],
                        status=result["status"],
                        output=result.get("output") or 'null',
                    )
                except Exception as e:
                    print(
                        ' ! Skipping import due to missing/invalid index.json:',
                        out_dir,
                        e,
                        '(open an issue with this index.json for help)',
                    )
def verify_json_index_integrity(snapshot):
    """Write back to ``index.json`` any of the snapshot's results it lacks.

    Loads ``ARCHIVE_DIR/<snapshot.timestamp>/index.json``, compares the
    ``start_ts`` of every entry in its ``history`` mapping against the
    snapshot's ArchiveResult rows, appends an entry for each row whose
    ``start_ts.isoformat()`` is not present, and rewrites the file using
    ``to_json``. The file is rewritten even when nothing was missing.

    Args:
        snapshot: Snapshot instance exposing ``timestamp`` and
            ``archiveresult_set``.

    Raises:
        OSError: If ``index.json`` cannot be opened for reading or writing.
        ValueError: If ``index.json`` does not contain valid JSON.
    """
    index_path = ARCHIVE_DIR / snapshot.timestamp / "index.json"

    with open(index_path, "r") as f:
        index = json.load(f)

    # An index without a history section is treated as having no results yet.
    history = index.setdefault("history", {})

    # Snapshot the known timestamps up front (set => O(1) membership tests);
    # it is intentionally not updated while appending below.
    known_start_ts = {
        entry.get("start_ts")
        for entries in history.values()
        for entry in entries
    }

    for missing in snapshot.archiveresult_set.all():
        if missing.start_ts.isoformat() in known_start_ts:
            continue
        # setdefault: the DB may hold results for an extractor that has no
        # entry at all in the on-disk history.
        history.setdefault(missing.extractor, []).append({
            "cmd": missing.cmd,
            "cmd_version": missing.cmd_version,
            "end_ts": missing.end_ts.isoformat(),
            "start_ts": missing.start_ts.isoformat(),
            "pwd": missing.pwd,
            "output": missing.output,
            "schema": "ArchiveResult",
            "status": missing.status,
        })

    # Serialize before opening for write so that a serialization failure
    # cannot leave a truncated index.json behind.
    json_index = to_json(index)
    with open(index_path, "w") as f:
        f.write(json_index)
def reverse_func(apps, schema_editor):
    """Reverse the data migration.

    Runs ``verify_json_index_integrity`` on every Snapshot, then deletes all
    ArchiveResult rows.

    Args:
        apps: Historical app registry passed in by ``migrations.RunPython``.
        schema_editor: Unused; required by the ``RunPython`` callable signature.
    """
    snapshot_model = apps.get_model("core", "Snapshot")
    result_model = apps.get_model("core", "ArchiveResult")

    # Verify every snapshot's index first; rows are only deleted afterwards.
    for snap in snapshot_model.objects.all():
        verify_json_index_integrity(snap)

    result_model.objects.all().delete()
class Migration(migrations.Migration):
    """Create the ArchiveResult table, then run the forwards/reverse data functions."""

    dependencies = [
        ('core', '0006_auto_20201012_1520'),
    ]

    operations = [
        # Schema step: one row per extractor run, linked to its Snapshot.
        migrations.CreateModel(
            name='ArchiveResult',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                ('cmd', JSONField()),
                ('pwd', models.CharField(max_length=256)),
                ('cmd_version', models.CharField(max_length=32)),
                (
                    'status',
                    models.CharField(
                        # Each choice is a (value, label) pair with identical text.
                        choices=[
                            (status, status)
                            for status in ('succeeded', 'failed', 'skipped')
                        ],
                        max_length=16,
                    ),
                ),
                ('output', models.CharField(max_length=512)),
                ('start_ts', models.DateTimeField()),
                ('end_ts', models.DateTimeField()),
                (
                    'extractor',
                    models.CharField(
                        # Each choice is a (value, label) pair with identical text.
                        choices=[
                            (extractor, extractor)
                            for extractor in (
                                'title',
                                'favicon',
                                'wget',
                                'singlefile',
                                'pdf',
                                'screenshot',
                                'dom',
                                'readability',
                                'mercury',
                                'git',
                                'media',
                                'headers',
                                'archive_org',
                            )
                        ],
                        max_length=32,
                    ),
                ),
                (
                    'snapshot',
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to='core.Snapshot',
                    ),
                ),
            ],
        ),
        # Data step: forwards_func on migrate, reverse_func on rollback.
        migrations.RunPython(forwards_func, reverse_func),
    ]