mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2025-02-16 21:38:33 +00:00
Add ArchiveResult Manager and sorted indexable filter
This commit is contained in:
parent
23a9beb4e0
commit
7903db6dfb
2 changed files with 17 additions and 3 deletions
|
@ -5,10 +5,11 @@ import uuid
|
|||
from django.db import models, transaction
|
||||
from django.utils.functional import cached_property
|
||||
from django.utils.text import slugify
|
||||
from django.db.models import Case, When, Value, IntegerField
|
||||
|
||||
from ..util import parse_date
|
||||
from ..index.schema import Link
|
||||
from ..extractors import get_default_archive_methods
|
||||
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
|
||||
|
||||
# (value, label) pairs passed as `choices` to ArchiveResult.extractor; both
# entries are the first element (the name) of each default archive method tuple.
EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
|
||||
STATUS_CHOICES = [
|
||||
|
@ -91,7 +92,7 @@ class Snapshot(models.Model):
|
|||
return {
|
||||
key: getattr(self, key)
|
||||
if key != 'tags' else self.tags_str()
|
||||
for key in args
|
||||
for key in args
|
||||
}
|
||||
|
||||
def as_link(self) -> Link:
|
||||
|
@ -100,7 +101,7 @@ class Snapshot(models.Model):
|
|||
def as_link_with_details(self) -> Link:
    """Return this snapshot's Link after passing it through ``load_link_details``."""
    # NOTE(review): imported at call time rather than at module load —
    # presumably to avoid a circular import with ..index; confirm.
    from ..index import load_link_details

    link = self.as_link()
    return load_link_details(link)
|
||||
|
||||
|
||||
def tags_str(self) -> str:
    """Return the names of this object's tags, ordered by name and comma-joined."""
    tag_names = self.tags.order_by('name').values_list('name', flat=True)
    return ','.join(tag_names)
|
||||
|
||||
|
@ -157,7 +158,15 @@ class Snapshot(models.Model):
|
|||
self.tags.clear()
|
||||
self.tags.add(*tags_id)
|
||||
|
||||
class ArchiveResultManager(models.Manager):
    """Manager that adds the ``indexable()`` query helper."""

    def indexable(self, sorted: bool = True):
        """Return succeeded results produced by an indexable extractor.

        Only rows whose ``extractor`` is listed in
        ``ARCHIVE_METHODS_INDEXING_PRECEDENCE`` and whose ``status`` is
        ``'succeeded'`` are kept.

        Args:
            sorted: When true, annotate each row with ``indexing_precedence``
                (the extractor's configured precedence, 1000 as the fallback)
                and order the queryset by it, ascending.

        Returns:
            The filtered (and optionally annotated and ordered) queryset.
        """
        method_names = [entry[0] for entry in ARCHIVE_METHODS_INDEXING_PRECEDENCE]
        results = self.get_queryset().filter(
            extractor__in=method_names,
            status='succeeded',
        )
        if not sorted:
            return results

        # One WHEN branch per extractor, mapping its name to its precedence.
        rank_cases = [
            When(extractor=name, then=Value(rank))
            for name, rank in ARCHIVE_METHODS_INDEXING_PRECEDENCE
        ]
        rank_expr = Case(
            *rank_cases,
            default=Value(1000),
            output_field=IntegerField(),
        )
        return results.annotate(indexing_precedence=rank_expr).order_by('indexing_precedence')
|
||||
class ArchiveResult(models.Model):
|
||||
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
|
||||
cmd = models.JSONField()
|
||||
|
@ -169,5 +178,7 @@ class ArchiveResult(models.Model):
|
|||
status = models.CharField(max_length=16, choices=STATUS_CHOICES)
|
||||
extractor = models.CharField(choices=EXTRACTORS, max_length=32)
|
||||
|
||||
objects = ArchiveResultManager()
|
||||
|
||||
def __str__(self):
|
||||
return self.extractor
|
||||
|
|
|
@ -39,6 +39,7 @@ from .media import should_save_media, save_media
|
|||
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
|
||||
from .headers import should_save_headers, save_headers
|
||||
|
||||
|
||||
def get_default_archive_methods():
|
||||
return [
|
||||
('title', should_save_title, save_title),
|
||||
|
@ -56,6 +57,8 @@ def get_default_archive_methods():
|
|||
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
|
||||
]
|
||||
|
||||
# (extractor name, precedence) pairs. ArchiveResultManager.indexable() keeps only
# results from these extractors and, when sorting, orders them by ascending
# precedence value.
ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]
|
||||
|
||||
@enforce_types
|
||||
def ignore_methods(to_ignore: List[str]):
|
||||
ARCHIVE_METHODS = get_default_archive_methods()
|
||||
|
|
Loading…
Add table
Reference in a new issue