# ArchiveBox/archivebox/core/models.py

__package__ = 'archivebox.core'

import uuid

from django.db import models, transaction
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.core.cache import cache
from django.db.models import Case, When, Value, IntegerField

from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from ..system import get_dir_size
from ..util import parse_date, base_url, hashurl
from ..index.schema import Link
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE

# (value, label) choice pairs for ArchiveResult.extractor: the first element of
# every entry returned by get_default_archive_methods(), used as both value and label.
EXTRACTORS = [
    (name, name)
    for name in (method[0] for method in get_default_archive_methods())
]

# (value, label) choice pairs for ArchiveResult.status.
STATUS_CHOICES = [
    (status, status)
    for status in ("succeeded", "failed", "skipped")
]

# Use the JSONField exposed by django.db.models when it exists; otherwise fall
# back to the one provided by the `jsonfield` package.
JSONField = getattr(models, 'JSONField', None)
if JSONField is None:
    import jsonfield
    JSONField = jsonfield.JSONField


class Tag(models.Model):
    """
    A uniquely named label with a unique slug (based on the django-taggit model).

    When a new tag is saved without a slug, one is derived from the name and,
    if that slug is already taken, a numeric ``_<n>`` suffix is appended.
    """
    name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)
    slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)

    class Meta:
        verbose_name = "Tag"
        verbose_name_plural = "Tags"

    def __str__(self):
        return self.name

    def slugify(self, tag, i=None):
        """Return the slug for `tag`, with "_<i>" appended when `i` is given."""
        base = slugify(tag)
        if i is None:
            return base
        return base + "_%d" % i

    def save(self, *args, **kwargs):
        """
        Save the tag, generating a free slug first when a new tag has none.

        Existing rows, and new rows that already carry a slug, are saved as-is.
        """
        needs_slug = self._state.adding and not self.slug
        if not needs_slug:
            return super().save(*args, **kwargs)

        self.slug = self.slugify(self.name)

        with transaction.atomic():
            # every stored slug that could collide with the base slug or a suffixed variant
            colliding = type(self)._default_manager.filter(slug__startswith=self.slug)
            taken = set(colliding.values_list("slug", flat=True))

            # try the bare slug first, then "_1", "_2", ... until one is free
            candidate = self.slugify(self.name, None)
            suffix = 0
            while candidate in taken:
                suffix += 1
                candidate = self.slugify(self.name, suffix)

            self.slug = candidate
            return super().save(*args, **kwargs)


class Snapshot(models.Model):
    """
    One archived URL, stored with a UUID primary key, a unique URL and a unique
    timestamp string.

    Besides the database columns, the model exposes cached helper properties
    derived from the timestamp/URL (paths, hashes, sizes) and conversion helpers
    to and from the `Link` schema object.
    """
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)

    url = models.URLField(unique=True)
    timestamp = models.CharField(max_length=32, unique=True, db_index=True)

    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)

    added = models.DateTimeField(auto_now_add=True, db_index=True)
    updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
    tags = models.ManyToManyField(Tag)

    # attribute names exchanged through from_json() / as_json()
    keys = ('url', 'timestamp', 'title', 'tags', 'updated')

    def __str__(self) -> str:
        """Return '[timestamp] url (title)', with url and title cut to 64 chars."""
        title = self.title or '-'
        return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'

    def __repr__(self) -> str:
        # same text as __str__; kept as a single implementation to avoid drift
        return self.__str__()

    @classmethod
    def from_json(cls, info: dict):
        """
        Build an unsaved Snapshot from the entries of `info` whose key is in `keys`.

        NOTE(review): `keys` contains 'tags', which is a many-to-many field here;
        presumably callers remove it before calling this and apply tags with
        save_tags() afterwards -- confirm against callers.
        """
        info = {k: v for k, v in info.items() if k in cls.keys}
        return cls(**info)

    def as_json(self, *args) -> dict:
        """
        Return a dict of the requested attribute names (default: `keys`).

        'tags' is rendered through tags_str(); every other name is read with getattr.
        """
        wanted = args or self.keys
        return {
            key: self.tags_str() if key == 'tags' else getattr(self, key)
            for key in wanted
        }

    def as_link(self) -> Link:
        """Convert this snapshot to a Link via its as_json() representation."""
        return Link.from_json(self.as_json())

    def as_link_with_details(self) -> Link:
        """Return as_link() passed through load_link_details()."""
        # imported here rather than at module level (presumably to avoid a circular import)
        from ..index import load_link_details
        return load_link_details(self.as_link())

    def _tags_cache_key(self):
        """
        Return the cache key used by tags_str(), or None when the instance has
        neither `updated` nor `added` set (e.g. it was never saved).
        """
        modified = self.updated or self.added
        if modified is None:
            return None
        return f'{self.id}-{modified.timestamp()}-tags'

    def tags_str(self) -> str:
        """
        Return the tag names sorted by name and joined with ','.

        The result is cached under a key built from the id and the
        updated/added time; instances without either time are computed
        directly instead of raising.
        """
        def calc_tags_str() -> str:
            return ','.join(self.tags.order_by('name').values_list('name', flat=True))

        cache_key = self._tags_cache_key()
        if cache_key is None:
            return calc_tags_str()
        return cache.get_or_set(cache_key, calc_tags_str)

    @cached_property
    def bookmarked(self):
        """The `timestamp` field passed through parse_date()."""
        return parse_date(self.timestamp)

    @cached_property
    def bookmarked_date(self):
        # TODO: remove this
        return self.bookmarked

    @cached_property
    def is_archived(self):
        """The `is_archived` value of the Link built from this snapshot."""
        return self.as_link().is_archived

    @cached_property
    def num_outputs(self):
        """Number of related ArchiveResult rows with status 'succeeded'."""
        return self.archiveresult_set.filter(status='succeeded').count()

    @cached_property
    def url_hash(self):
        """hashurl() of the snapshot URL."""
        return hashurl(self.url)

    @cached_property
    def base_url(self):
        """base_url() of the snapshot URL."""
        return base_url(self.url)

    @cached_property
    def link_dir(self):
        """String path of this snapshot's directory: ARCHIVE_DIR / timestamp."""
        return str(ARCHIVE_DIR / self.timestamp)

    @cached_property
    def archive_path(self):
        """'<ARCHIVE_DIR_NAME>/<timestamp>' as a plain string."""
        return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)

    @cached_property
    def archive_size(self):
        """
        First element of get_dir_size(link_dir) (presumably the size in bytes),
        or 0 when it cannot be computed.

        Cached under a key built from the id prefix and the updated/added time;
        instances without either time are computed directly instead of raising.
        """
        def calc_dir_size():
            try:
                return get_dir_size(self.link_dir)[0]
            except Exception:
                # deliberate best effort: any failure to measure the directory counts as size 0
                return 0

        modified = self.updated or self.added
        if modified is None:
            return calc_dir_size()

        cache_key = f'{str(self.id)[:12]}-{modified.timestamp()}-size'
        return cache.get_or_set(cache_key, calc_dir_size)

    @cached_property
    def history(self):
        """The `history` attribute of the detailed Link for this snapshot."""
        # TODO: use ArchiveResult for this instead of json
        return self.as_link_with_details().history

    @staticmethod
    def _longest_title(candidates):
        """
        Return the longest non-blank string in `candidates`, stripped, or None.

        Non-string values are ignored; on equal length the later candidate wins.
        """
        longest = None
        for candidate in candidates:
            if not isinstance(candidate, str):
                continue
            candidate = candidate.strip()
            if candidate and (longest is None or len(candidate) >= len(longest)):
                longest = candidate
        return longest

    @cached_property
    def latest_title(self):
        """
        Best available title for this snapshot, or None.

        Preference order: the `title` column, then the longest successful
        'title' output stored in ArchiveResult rows, then the longest successful
        'title' output found in the Link history. Blank outputs are skipped in
        both fallbacks.
        """
        if self.title:
            return self.title   # whoopdedoo that was easy

        # take longest successful title from ArchiveResult db history
        db_outputs = (
            self.archiveresult_set
            .filter(extractor='title', status='succeeded', output__isnull=False)
            .values_list('output', flat=True)
        )
        db_title = self._longest_title(db_outputs)
        if db_title:
            return db_title

        # take longest successful title from Link json index file history
        try:
            json_results = self.history['title']
        except KeyError:
            return None

        return self._longest_title(
            result.output
            for result in json_results
            if result.status == 'succeeded'
        )

    def save_tags(self, tags=()):
        """
        Replace this snapshot's tags with the tags named in `tags`.

        Tag rows are looked up by name and created when missing; blank names
        are skipped. The swap of the relation runs in one transaction so a
        failure cannot leave the snapshot without tags, and the cached
        tags_str() value is dropped afterwards because `updated` (part of the
        cache key) does not change here.
        """
        tag_ids = [
            Tag.objects.get_or_create(name=name)[0].id
            for name in tags
            if name and name.strip()
        ]

        with transaction.atomic():
            self.tags.clear()
            self.tags.add(*tag_ids)

        cache_key = self._tags_cache_key()
        if cache_key is not None:
            cache.delete(cache_key)


class ArchiveResultManager(models.Manager):
    """Manager adding a helper that selects the indexable archive results."""

    def indexable(self, sorted: bool = True):
        """
        Return succeeded results whose extractor is listed in
        ARCHIVE_METHODS_INDEXING_PRECEDENCE.

        When `sorted` is true the queryset is annotated with
        `indexing_precedence` (the value paired with each extractor, 1000 for
        anything unmatched) and ordered by it.
        """
        indexable_methods = [entry[0] for entry in ARCHIVE_METHODS_INDEXING_PRECEDENCE]
        results = self.get_queryset().filter(
            extractor__in=indexable_methods,
            status='succeeded',
        )
        if not sorted:
            return results

        rank_cases = [
            When(extractor=method, then=Value(rank))
            for method, rank in ARCHIVE_METHODS_INDEXING_PRECEDENCE
        ]
        ranking = Case(*rank_cases, default=Value(1000), output_field=IntegerField())
        return results.annotate(indexing_precedence=ranking).order_by('indexing_precedence')


class ArchiveResult(models.Model):
    """
    Database record of one extractor result attached to a Snapshot.

    Rows are deleted together with their Snapshot (CASCADE). The default
    manager is ArchiveResultManager, which adds `indexable()`.
    """
    # explicitly declared integer primary key
    id = models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')
    # random UUID generated by default; not editable (no unique constraint is declared here)
    uuid = models.UUIDField(default=uuid.uuid4, editable=False)

    # owning snapshot; reachable from Snapshot as `archiveresult_set`
    snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
    # extractor name, restricted to the EXTRACTORS choices
    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
    # presumably the command that was run, stored as JSON -- confirm against writers
    cmd = JSONField()
    # presumably the working directory the command ran in -- confirm against writers
    pwd = models.CharField(max_length=256)
    # optional version string for the command; defaults to NULL
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
    # extractor output, up to 1024 characters (Snapshot.latest_title reads it for 'title' results)
    output = models.CharField(max_length=1024)
    # start and end times of the run; only start_ts is indexed
    start_ts = models.DateTimeField(db_index=True)
    end_ts = models.DateTimeField()
    # one of the STATUS_CHOICES values
    status = models.CharField(max_length=16, choices=STATUS_CHOICES)

    objects = ArchiveResultManager()

    def __str__(self):
        # a result is displayed by its extractor name only
        return self.extractor
first working django model with archivebox-shell command and sql exporting 2019-04-17 07:49:18 +00:00			`__package__ = 'archivebox.core'`

			`import uuid`

refactor: Remove django-taggit and replace it with a local tags setup 2020-10-12 18:47:03 +00:00			`from django.db import models, transaction`
improve sort columns and UI placeholders 2020-06-30 10:41:48 +00:00			`from django.utils.functional import cached_property`
refactor: Remove django-taggit and replace it with a local tags setup 2020-10-12 18:47:03 +00:00			`from django.utils.text import slugify`
cache dir size, snapshot icons, tags str, and title in django cache 2021-02-16 20:49:29 +00:00			`from django.core.cache import cache`
Add ArchiveResult Manager and sorted indexable filter 2020-11-23 18:04:38 +00:00			`from django.db.models import Case, When, Value, IntegerField`
Improved tags 2020-09-21 16:50:26 +00:00
cache dir size, snapshot icons, tags str, and title in django cache 2021-02-16 20:49:29 +00:00			`from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME`
			`from ..system import get_dir_size`
compute snapshot properties directly without loading whole Link 2021-02-16 01:44:08 +00:00			`from ..util import parse_date, base_url, hashurl`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`from ..index.schema import Link`
Add ArchiveResult Manager and sorted indexable filter 2020-11-23 18:04:38 +00:00			`from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE`
feat: Add extractor field to the database 2020-11-04 12:28:02 +00:00
			`EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]`
fix: Update model according to code review 2020-11-23 23:28:43 +00:00			`STATUS_CHOICES = [`
			`("succeeded", "succeeded"),`
			`("failed", "failed"),`
			`("skipped", "skipped")`
			`]`
django admin to view links now working 2019-04-22 17:20:19 +00:00
fallback to old JSONField from lib if django version is old 2020-12-11 18:45:44 +00:00			`try:`
			`JSONField = models.JSONField`
			`except AttributeError:`
			`import jsonfield`
			`JSONField = jsonfield.JSONField`

first working django model with archivebox-shell command and sql exporting 2019-04-17 07:49:18 +00:00
refactor: Remove django-taggit and replace it with a local tags setup 2020-10-12 18:47:03 +00:00			`class Tag(models.Model):`
			`"""`
			`Based on django-taggit model`
			`"""`
			`name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)`
			`slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)`
Improved tags 2020-09-21 16:50:26 +00:00
			`class Meta:`
			`verbose_name = "Tag"`
			`verbose_name_plural = "Tags"`

refactor: Remove django-taggit and replace it with a local tags setup 2020-10-12 18:47:03 +00:00			`def __str__(self):`
			`return self.name`

			`def slugify(self, tag, i=None):`
			`slug = slugify(tag)`
			`if i is not None:`
			`slug += "_%d" % i`
			`return slug`

			`def save(self, *args, **kwargs):`
			`if self._state.adding and not self.slug:`
			`self.slug = self.slugify(self.name)`

			`with transaction.atomic():`
			`slugs = set(`
			`type(self)`
			`._default_manager.filter(slug__startswith=self.slug)`
			`.values_list("slug", flat=True)`
			`)`

			`i = None`
			`while True:`
			`slug = self.slugify(self.name, i)`
			`if slug not in slugs:`
			`self.slug = slug`
			`return super().save(*args, **kwargs)`
			`i = 1 if i is None else i+1`
			`else:`
			`return super().save(*args, **kwargs)`

feat: Remove walrus operator (we still need to support python3.7) 2020-12-06 17:23:02 +00:00
rename model Page to Snapshot 2019-05-01 03:44:51 +00:00			`class Snapshot(models.Model):`
first working django model with archivebox-shell command and sql exporting 2019-04-17 07:49:18 +00:00			`id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)`

			`url = models.URLField(unique=True)`
make snapshots unique again 2020-07-13 16:21:52 +00:00			`timestamp = models.CharField(max_length=32, unique=True, db_index=True)`
first working django model with archivebox-shell command and sql exporting 2019-04-17 07:49:18 +00:00
increase max title length to 512 2021-02-18 07:33:08 +00:00			`title = models.CharField(max_length=512, null=True, blank=True, db_index=True)`
first working django model with archivebox-shell command and sql exporting 2019-04-17 07:49:18 +00:00
rename archivebox-info to archivebox-status 2020-06-26 03:32:01 +00:00			`added = models.DateTimeField(auto_now_add=True, db_index=True)`
add snapshot_id to Link and uuid to ArchiveResult 2021-02-16 20:54:27 +00:00			`updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)`
refactor: Remove django-taggit and replace it with a local tags setup 2020-10-12 18:47:03 +00:00			`tags = models.ManyToManyField(Tag)`
first working django model with archivebox-shell command and sql exporting 2019-04-17 07:49:18 +00:00
clearer sql parsing and dumping 2019-04-17 07:50:41 +00:00			`keys = ('url', 'timestamp', 'title', 'tags', 'updated')`
first working django model with archivebox-shell command and sql exporting 2019-04-17 07:49:18 +00:00
django admin to view links now working 2019-04-22 17:20:19 +00:00			`def __repr__(self) -> str:`
fix missing imports 2020-06-30 09:55:34 +00:00			`title = self.title or '-'`
			`return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'`
django admin to view links now working 2019-04-22 17:20:19 +00:00
			`def __str__(self) -> str:`
fix missing imports 2020-06-30 09:55:34 +00:00			`title = self.title or '-'`
			`return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'`
django admin to view links now working 2019-04-22 17:20:19 +00:00
first working django model with archivebox-shell command and sql exporting 2019-04-17 07:49:18 +00:00			`@classmethod`
			`def from_json(cls, info: dict):`
clearer sql parsing and dumping 2019-04-17 07:50:41 +00:00			`info = {k: v for k, v in info.items() if k in cls.keys}`
first working django model with archivebox-shell command and sql exporting 2019-04-17 07:49:18 +00:00			`return cls(**info)`

			`def as_json(self, *args) -> dict:`
clearer sql parsing and dumping 2019-04-17 07:50:41 +00:00			`args = args or self.keys`
first working django model with archivebox-shell command and sql exporting 2019-04-17 07:49:18 +00:00			`return {`
			`key: getattr(self, key)`
config fixes 2020-10-31 11:55:27 +00:00			`if key != 'tags' else self.tags_str()`
Add ArchiveResult Manager and sorted indexable filter 2020-11-23 18:04:38 +00:00			`for key in args`
first working django model with archivebox-shell command and sql exporting 2019-04-17 07:49:18 +00:00			`}`
django admin to view links now working 2019-04-22 17:20:19 +00:00
			`def as_link(self) -> Link:`
			`return Link.from_json(self.as_json())`

fix: Save history in main index (to mimic previous behaviour) 2020-08-28 16:08:03 +00:00			`def as_link_with_details(self) -> Link:`
			`from ..index import load_link_details`
			`return load_link_details(self.as_link())`
Add ArchiveResult Manager and sorted indexable filter 2020-11-23 18:04:38 +00:00
config fixes 2020-10-31 11:55:27 +00:00			`def tags_str(self) -> str:`
cache dir size, snapshot icons, tags str, and title in django cache 2021-02-16 20:49:29 +00:00			`cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'`
			`calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))`
			`return cache.get_or_set(cache_key, calc_tags_str)`
fix: Save history in main index (to mimic previous behaviour) 2020-08-28 16:08:03 +00:00
improve sort columns and UI placeholders 2020-06-30 10:41:48 +00:00			`@cached_property`
customize django admin ui chrome 2019-04-23 01:40:42 +00:00			`def bookmarked(self):`
			`return parse_date(self.timestamp)`

compute snapshot properties directly without loading whole Link 2021-02-16 01:44:08 +00:00			`@cached_property`
			`def bookmarked_date(self):`
			`# TODO: remove this`
			`return self.bookmarked`

improve sort columns and UI placeholders 2020-06-30 10:41:48 +00:00			`@cached_property`
django admin to view links now working 2019-04-22 17:20:19 +00:00			`def is_archived(self):`
			`return self.as_link().is_archived`

improve sort columns and UI placeholders 2020-06-30 10:41:48 +00:00			`@cached_property`
django admin to view links now working 2019-04-22 17:20:19 +00:00			`def num_outputs(self):`
use database for num_outputs instead of legacy json 2020-11-28 06:05:53 +00:00			`return self.archiveresult_set.filter(status='succeeded').count()`
django admin to view links now working 2019-04-22 17:20:19 +00:00
improve sort columns and UI placeholders 2020-06-30 10:41:48 +00:00			`@cached_property`
django admin to view links now working 2019-04-22 17:20:19 +00:00			`def url_hash(self):`
compute snapshot properties directly without loading whole Link 2021-02-16 01:44:08 +00:00			`return hashurl(self.url)`
django admin to view links now working 2019-04-22 17:20:19 +00:00
improve sort columns and UI placeholders 2020-06-30 10:41:48 +00:00			`@cached_property`
django admin to view links now working 2019-04-22 17:20:19 +00:00			`def base_url(self):`
compute snapshot properties directly without loading whole Link 2021-02-16 01:44:08 +00:00			`return base_url(self.url)`
split up utils into separate files 2019-05-01 03:13:04 +00:00
improve sort columns and UI placeholders 2020-06-30 10:41:48 +00:00			`@cached_property`
split up utils into separate files 2019-05-01 03:13:04 +00:00			`def link_dir(self):`
compute snapshot properties directly without loading whole Link 2021-02-16 01:44:08 +00:00			`return str(ARCHIVE_DIR / self.timestamp)`
rename archivebox-info to archivebox-status 2020-06-26 03:32:01 +00:00
improve sort columns and UI placeholders 2020-06-30 10:41:48 +00:00			`@cached_property`
rename archivebox-info to archivebox-status 2020-06-26 03:32:01 +00:00			`def archive_path(self):`
inline archive_size and archive_path snapshot methods 2021-02-16 11:16:02 +00:00			`return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)`
rename archivebox-info to archivebox-status 2020-06-26 03:32:01 +00:00
improve sort columns and UI placeholders 2020-06-30 10:41:48 +00:00			`@cached_property`
rename archivebox-info to archivebox-status 2020-06-26 03:32:01 +00:00			`def archive_size(self):`
cache dir size, snapshot icons, tags str, and title in django cache 2021-02-16 20:49:29 +00:00			`cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'`

			`def calc_dir_size():`
			`try:`
			`return get_dir_size(self.link_dir)[0]`
			`except Exception:`
			`return 0`

			`return cache.get_or_set(cache_key, calc_dir_size)`
improve sort columns and UI placeholders 2020-06-30 10:41:48 +00:00
			`@cached_property`
			`def history(self):`
use database for num_outputs instead of legacy json 2020-11-28 06:05:53 +00:00			`# TODO: use ArchiveResult for this instead of json`
simplify history helper 2020-11-28 06:14:45 +00:00			`return self.as_link_with_details().history`
improve sort columns and UI placeholders 2020-06-30 10:41:48 +00:00
			`@cached_property`
			`def latest_title(self):`
speed up latest_title by preferring db title if present 2021-02-16 20:49:53 +00:00			`if self.title:`
improve latest title logic to take longest title 2021-02-18 07:33:28 +00:00			`return self.title # whoopdedoo that was easy`
speed up latest_title by preferring db title if present 2021-02-16 20:49:53 +00:00
			`try:`
improve latest title logic to take longest title 2021-02-18 07:33:28 +00:00			`# take longest successful title from ArchiveResult db history`
			`return sorted(`
			`self.archiveresult_set\`
			`.filter(extractor='title', status='succeeded', output__isnull=False)\`
			`.values_list('output', flat=True),`
			`key=lambda r: len(r),`
			`)[-1]`
			`except IndexError:`
speed up latest_title by preferring db title if present 2021-02-16 20:49:53 +00:00			`pass`

improve latest title logic to take longest title 2021-02-18 07:33:28 +00:00			`try:`
			`# take longest successful title from Link json index file history`
			`return sorted(`
			`(`
			`result.output.strip()`
			`for result in self.history['title']`
			`if result.status == 'succeeded' and result.output.strip()`
			`),`
			`key=lambda r: len(r),`
			`)[-1]`
			`except (KeyError, IndexError):`
			`pass`
speed up latest_title by preferring db title if present 2021-02-16 20:49:53 +00:00
improve sort columns and UI placeholders 2020-06-30 10:41:48 +00:00			`return None`
refactor: Remove django-taggit and replace it with a local tags setup 2020-10-12 18:47:03 +00:00
default function args can never be mutable 2020-11-28 06:06:11 +00:00			`def save_tags(self, tags=()):`
refactor: Remove django-taggit and replace it with a local tags setup 2020-10-12 18:47:03 +00:00			`tags_id = []`
			`for tag in tags:`
			`tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)`
			`self.tags.clear()`
			`self.tags.add(*tags_id)`
feat: Initial (and naive) ArchiveResult model 2020-11-03 14:54:02 +00:00
feat: Remove walrus operator (we still need to support python3.7) 2020-12-06 17:23:02 +00:00
Add ArchiveResult Manager and sorted indexable filter 2020-11-23 18:04:38 +00:00			`class ArchiveResultManager(models.Manager):`
			`def indexable(self, sorted: bool = True):`
			`INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]`
			`qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')`
feat: Initial (and naive) ArchiveResult model 2020-11-03 14:54:02 +00:00
Add ArchiveResult Manager and sorted indexable filter 2020-11-23 18:04:38 +00:00			`if sorted:`
			`precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]`
			`qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')`
			`return qs`
feat: Remove walrus operator (we still need to support python3.7) 2020-12-06 17:23:02 +00:00

feat: Initial (and naive) ArchiveResult model 2020-11-03 14:54:02 +00:00			`class ArchiveResult(models.Model):`
add snapshot_id to Link and uuid to ArchiveResult 2021-02-16 20:54:27 +00:00			`id = models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')`
			`uuid = models.UUIDField(default=uuid.uuid4, editable=False)`

feat: Initial (and naive) ArchiveResult model 2020-11-03 14:54:02 +00:00			`snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)`
fix ArchiveResult extractor showing up on top of dropdown in admin inline form 2021-02-17 23:25:01 +00:00			`extractor = models.CharField(choices=EXTRACTORS, max_length=32)`
fallback to old JSONField from lib if django version is old 2020-12-11 18:45:44 +00:00			`cmd = JSONField()`
Apply suggestions from code review minor nit 2020-11-27 05:01:34 +00:00			`pwd = models.CharField(max_length=256)`
add snapshot_id to Link and uuid to ArchiveResult 2021-02-16 20:54:27 +00:00			`cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)`
			`output = models.CharField(max_length=1024)`
			`start_ts = models.DateTimeField(db_index=True)`
feat: Add extractor field to the database 2020-11-04 12:28:02 +00:00			`end_ts = models.DateTimeField()`
fix: Update model according to code review 2020-11-23 23:28:43 +00:00			`status = models.CharField(max_length=16, choices=STATUS_CHOICES)`
feat: initial functional version with icons calculated based on archive results 2020-11-04 15:31:20 +00:00
Add ArchiveResult Manager and sorted indexable filter 2020-11-23 18:04:38 +00:00			`objects = ArchiveResultManager()`

feat: initial functional version with icons calculated based on archive results 2020-11-04 15:31:20 +00:00			`def __str__(self):`
			`return self.extractor`