From 65afd405b1f09ca8801c9733ac1c913b39198148 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 18 Nov 2024 19:23:14 -0800 Subject: [PATCH] merge seeds and crawls apps --- archivebox/__init__.py | 1 - archivebox/api/v1_crawls.py | 7 +- archivebox/cli/archivebox_add.py | 3 +- archivebox/core/settings.py | 3 +- archivebox/crawls/admin.py | 59 +++++++++++- archivebox/crawls/models.py | 107 +++++++++++++++++++++- archivebox/search/__init__.py | 2 +- archivebox/search/admin.py | 3 +- archivebox/seeds/__init__.py | 12 --- archivebox/seeds/admin.py | 68 -------------- archivebox/seeds/apps.py | 6 -- archivebox/seeds/migrations/__init__.py | 0 archivebox/seeds/models.py | 115 ------------------------ archivebox/seeds/tests.py | 3 - archivebox/seeds/views.py | 3 - 15 files changed, 168 insertions(+), 224 deletions(-) delete mode 100644 archivebox/seeds/__init__.py delete mode 100644 archivebox/seeds/admin.py delete mode 100644 archivebox/seeds/apps.py delete mode 100644 archivebox/seeds/migrations/__init__.py delete mode 100644 archivebox/seeds/models.py delete mode 100644 archivebox/seeds/tests.py delete mode 100644 archivebox/seeds/views.py diff --git a/archivebox/__init__.py b/archivebox/__init__.py index 8c65a60f..066c2ee7 100755 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -85,7 +85,6 @@ ARCHIVEBOX_BUILTIN_PLUGINS = { 'workers': PACKAGE_DIR / 'workers', 'core': PACKAGE_DIR / 'core', 'crawls': PACKAGE_DIR / 'crawls', - 'seeds': PACKAGE_DIR / 'seeds', # 'search': PACKAGE_DIR / 'search', # 'core': PACKAGE_DIR / 'core', } diff --git a/archivebox/api/v1_crawls.py b/archivebox/api/v1_crawls.py index 97e95a6a..2c8ac63d 100644 --- a/archivebox/api/v1_crawls.py +++ b/archivebox/api/v1_crawls.py @@ -10,8 +10,7 @@ from django.contrib.auth import get_user_model from ninja import Router, Schema from core.models import Snapshot -from crawls.models import Crawl -from seeds.models import Seed +from crawls.models import Seed, Crawl from .auth import API_AUTH_METHODS @@ -19,7 +18,7 @@ router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS) class SeedSchema(Schema): - TYPE: str = 'seeds.models.Seed' + TYPE: str = 'crawls.models.Seed' id: UUID abid: str @@ -60,7 +59,7 @@ def get_seed(request, seed_id: str): class CrawlSchema(Schema): - TYPE: str = 'core.models.Crawl' + TYPE: str = 'crawls.models.Crawl' id: UUID abid: str diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index db0bb305..c90ed323 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -51,8 +51,7 @@ def add(urls: str | list[str], setup_django() check_data_folder() - from seeds.models import Seed - from crawls.models import Crawl + from crawls.models import Seed, Crawl from workers.orchestrator import Orchestrator from abid_utils.models import get_or_create_system_user_pk diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 43853df2..22b0d9a4 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -65,8 +65,7 @@ INSTALLED_APPS = [ 'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here) 'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc. 
'workers', # handles starting and managing background workers and processes (orchestrators and actors) - 'seeds', # handles Seed model and URL source management - 'crawls', # handles Crawl and CrawlSchedule models and management + 'crawls', # handles Seed, Crawl, and CrawlSchedule models and management 'personas', # handles Persona and session management 'core', # core django model with Snapshot, ArchiveResult, etc. 'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc. diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py index c08cfbde..34221fa6 100644 --- a/archivebox/crawls/admin.py +++ b/archivebox/crawls/admin.py @@ -1,7 +1,5 @@ __package__ = 'archivebox.crawls' -import abx - from django.utils.html import format_html, format_html_join from django.contrib import admin @@ -10,7 +8,59 @@ from archivebox import DATA_DIR from abid_utils.admin import ABIDModelAdmin from core.models import Snapshot -from crawls.models import Crawl, CrawlSchedule +from crawls.models import Seed, Crawl, CrawlSchedule + + +class SeedAdmin(ABIDModelAdmin): + list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots') + sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str') + search_fields = ('abid', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str') + + readonly_fields = ('created_at', 'modified_at', 'abid_info', 'scheduled_crawls', 'crawls', 'snapshots', 'contents') + fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields) + + list_filter = ('extractor', 'created_by') + ordering = ['-created_at'] + list_per_page = 100 + actions = ["delete_selected"] + + def num_crawls(self, obj): + return obj.crawl_set.count() + + def num_snapshots(self, obj): + return obj.snapshot_set.count() + + def scheduled_crawls(self, obj): + return format_html_join('
<br/>', ' - <a href="{}">{}</a>', ( + (scheduledcrawl.admin_change_url, scheduledcrawl) + for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20] + )) or format_html('<i>No Scheduled Crawls yet...</i>') + + def crawls(self, obj): + return format_html_join('
<br/>', ' - <a href="{}">{}</a>', ( + (crawl.admin_change_url, crawl) + for crawl in obj.crawl_set.all().order_by('-created_at')[:20] + )) or format_html('<i>No Crawls yet...</i>') + + def snapshots(self, obj): + return format_html_join('
<br/>', ' - <a href="{}">{}</a>', ( + (snapshot.admin_change_url, snapshot) + for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20] + )) or format_html('<i>No Snapshots yet...</i>') + + def contents(self, obj): + if obj.uri.startswith('file:///data/'): + source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1) + contents = "" + try: + contents = source_file.read_text().strip()[:14_000] + except Exception as e: + contents = f'Error reading {source_file}: {e}' + + return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>
', source_file, contents) + + return format_html('See URLs here: {}', obj.uri, obj.uri) + @@ -102,7 +152,8 @@ class CrawlScheduleAdmin(ABIDModelAdmin): for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20] )) or format_html('No Snapshots yet...') -@abx.hookimpl + def register_admin(admin_site): + admin_site.register(Seed, SeedAdmin) admin_site.register(Crawl, CrawlAdmin) admin_site.register(CrawlSchedule, CrawlScheduleAdmin) diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index e0f8a299..d37908af 100644 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -1,6 +1,7 @@ __package__ = 'archivebox.crawls' from typing import TYPE_CHECKING +from pathlib import Path from django_stubs_ext.db.models import TypedModelMeta from django.db import models @@ -12,12 +13,114 @@ from django.utils import timezone from workers.models import ModelWithStateMachine +from archivebox.config import CONSTANTS +from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk + if TYPE_CHECKING: from core.models import Snapshot, ArchiveResult -from seeds.models import Seed -from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats + +class Seed(ABIDModel, ModelWithHealthStats): + """ + A fountain that produces URLs (+metadata) each time it's queried e.g. + - file:///data/sources/2024-01-02_11-57-51__cli_add.txt + - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt + - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks + - https://getpocket.com/user/nikisweeting/feed + - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml + - ... + Each query of a Seed can produce the same list of URLs, or a different list each time. + The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots. + + When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI. + The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks. + The outlinks then get turned into new pending Snapshots under the same crawl, + and the cycle repeats until Crawl.max_depth. + + Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to + stateful remote services, files with contents that change, directories that have new files within, etc. 
+ """ + + abid_prefix = 'src_' + abid_ts_src = 'self.created_at' + abid_uri_src = 'self.uri' + abid_subtype_src = 'self.extractor' + abid_rand_src = 'self.id' + abid_drift_allowed = True + + id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') + abid = ABIDField(prefix=abid_prefix) + + uri = models.URLField(max_length=2000, blank=False, null=False) # unique source location where URLs will be loaded from + label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed') + notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have') + + extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)') + tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source') + config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source') + + created_at = AutoDateTimeField(default=None, null=False, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False) + + + crawl_set: models.Manager['Crawl'] + + class Meta: + verbose_name = 'Seed' + verbose_name_plural = 'Seeds' + + unique_together = (('created_by', 'uri', 'extractor'),) + + + @classmethod + def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None): + source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data') + + seed, _ = cls.objects.get_or_create( + label=label or source_file.name, + uri=f'file://{source_path}', + created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(), + extractor=parser, + tags_str=tag, + config=config or {}, + ) + seed.save() + return seed + + @property + def source_type(self): + # e.g. http/https:// + # file:// + # pocketapi:// + # s3:// + # etc.. 
+ return self.uri.split('://', 1)[0].lower() + + @property + def api_url(self) -> str: + # /api/v1/core/seed/{uulid} + return reverse_lazy('api-1:get_seed', args=[self.abid]) # + f'?api_key={get_or_create_api_token(request.user)}' + + @property + def api_docs_url(self) -> str: + return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed' + + @property + def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']: + from crawls.models import CrawlSchedule + return CrawlSchedule.objects.filter(template__seed_id=self.pk) + + @property + def snapshot_set(self) -> QuerySet['Snapshot']: + from core.models import Snapshot + + crawl_ids = self.crawl_set.values_list('pk', flat=True) + return Snapshot.objects.filter(crawl_id__in=crawl_ids) + + + class CrawlSchedule(ABIDModel, ModelWithHealthStats): diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index 921c074f..ea059db1 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -34,7 +34,7 @@ def get_file_result_content(res, extra_path, use_pwd=False): return [] -# This should be abstracted by a plugin interface for extractors +# TODO: This should be abstracted by a plugin interface for extractors @enforce_types def get_indexable_content(results: QuerySet): if not results: diff --git a/archivebox/search/admin.py b/archivebox/search/admin.py index 42aadf6f..0f7bcc8c 100644 --- a/archivebox/search/admin.py +++ b/archivebox/search/admin.py @@ -1,10 +1,11 @@ __package__ = 'archivebox.search' from django.contrib import messages +from django.contrib import admin from archivebox.search import query_search_index -class SearchResultsAdminMixin: +class SearchResultsAdminMixin(admin.ModelAdmin): def get_search_results(self, request, queryset, search_term: str): """Enhances the search queryset with results from the search backend""" diff --git a/archivebox/seeds/__init__.py b/archivebox/seeds/__init__.py deleted file mode 100644 index 7c3cd823..00000000 --- a/archivebox/seeds/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ - -__package__ = 'archivebox.seeds' -__order__ = 100 - -import abx - - -@abx.hookimpl -def register_admin(admin_site): - from .admin import register_admin as register_seeds_admin - register_seeds_admin(admin_site) - diff --git a/archivebox/seeds/admin.py b/archivebox/seeds/admin.py deleted file mode 100644 index 84f76c46..00000000 --- a/archivebox/seeds/admin.py +++ /dev/null @@ -1,68 +0,0 @@ -__package__ = 'archivebox.seeds' - -import abx - -from django.utils.html import format_html_join, format_html - -from abid_utils.admin import ABIDModelAdmin - -from archivebox import DATA_DIR - -from seeds.models import Seed - - - -class SeedAdmin(ABIDModelAdmin): - list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots') - sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str') - search_fields = ('abid', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str') - - readonly_fields = ('created_at', 'modified_at', 'abid_info', 'scheduled_crawls', 'crawls', 'snapshots', 'contents') - fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields) - - list_filter = ('extractor', 'created_by') - ordering = ['-created_at'] - list_per_page = 100 - actions = ["delete_selected"] - - def num_crawls(self, obj): - return obj.crawl_set.count() - - def num_snapshots(self, obj): - return obj.snapshot_set.count() - - def scheduled_crawls(self, 
obj): - return format_html_join('
<br/>', ' - <a href="{}">{}</a>', ( - (scheduledcrawl.admin_change_url, scheduledcrawl) - for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20] - )) or format_html('<i>No Scheduled Crawls yet...</i>') - - def crawls(self, obj): - return format_html_join('
<br/>', ' - <a href="{}">{}</a>', ( - (crawl.admin_change_url, crawl) - for crawl in obj.crawl_set.all().order_by('-created_at')[:20] - )) or format_html('<i>No Crawls yet...</i>') - - def snapshots(self, obj): - return format_html_join('
<br/>', ' - <a href="{}">{}</a>', ( - (snapshot.admin_change_url, snapshot) - for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20] - )) or format_html('<i>No Snapshots yet...</i>') - - def contents(self, obj): - if obj.uri.startswith('file:///data/'): - source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1) - contents = "" - try: - contents = source_file.read_text().strip()[:14_000] - except Exception as e: - contents = f'Error reading {source_file}: {e}' - - return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>
', source_file, contents) - - return format_html('See URLs here: {}', obj.uri, obj.uri) - - -@abx.hookimpl -def register_admin(admin_site): - admin_site.register(Seed, SeedAdmin) diff --git a/archivebox/seeds/apps.py b/archivebox/seeds/apps.py deleted file mode 100644 index 38eb4fde..00000000 --- a/archivebox/seeds/apps.py +++ /dev/null @@ -1,6 +0,0 @@ -from django.apps import AppConfig - - -class SeedsConfig(AppConfig): - default_auto_field = "django.db.models.BigAutoField" - name = "seeds" diff --git a/archivebox/seeds/migrations/__init__.py b/archivebox/seeds/migrations/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/seeds/models.py b/archivebox/seeds/models.py deleted file mode 100644 index ce96c913..00000000 --- a/archivebox/seeds/models.py +++ /dev/null @@ -1,115 +0,0 @@ -__package__ = 'archivebox.seeds' - -from typing import TYPE_CHECKING -from pathlib import Path - -from django.db import models -from django.db.models import QuerySet -from django.conf import settings -from django.urls import reverse_lazy - -from archivebox.config import CONSTANTS -from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk - -if TYPE_CHECKING: - from crawls.models import Crawl, CrawlSchedule - from core.models import Snapshot - - -class Seed(ABIDModel, ModelWithHealthStats): - """ - A fountain that produces URLs (+metadata) each time it's queried e.g. - - file:///data/sources/2024-01-02_11-57-51__cli_add.txt - - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt - - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks - - https://getpocket.com/user/nikisweeting/feed - - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml - - ... - Each query of a Seed can produce the same list of URLs, or a different list each time. - The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots. - - When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI. - The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks. - The outlinks then get turned into new pending Snapshots under the same crawl, - and the cycle repeats until Crawl.max_depth. - - Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to - stateful remote services, files with contents that change, directories that have new files within, etc. 
- """ - - abid_prefix = 'src_' - abid_ts_src = 'self.created_at' - abid_uri_src = 'self.uri' - abid_subtype_src = 'self.extractor' - abid_rand_src = 'self.id' - abid_drift_allowed = True - - id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') - abid = ABIDField(prefix=abid_prefix) - - uri = models.URLField(max_length=2000, blank=False, null=False) # unique source location where URLs will be loaded from - label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed') - notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have') - - extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)') - tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source') - config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source') - - created_at = AutoDateTimeField(default=None, null=False, db_index=True) - modified_at = models.DateTimeField(auto_now=True) - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False) - - - crawl_set: models.Manager['Crawl'] - - class Meta: - verbose_name = 'Seed' - verbose_name_plural = 'Seeds' - - unique_together = (('created_by', 'uri', 'extractor'),) - - - @classmethod - def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None): - source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data') - - seed, _ = cls.objects.get_or_create( - label=label or source_file.name, - uri=f'file://{source_path}', - created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(), - extractor=parser, - tags_str=tag, - config=config or {}, - ) - seed.save() - return seed - - @property - def source_type(self): - # e.g. http/https:// - # file:// - # pocketapi:// - # s3:// - # etc.. - return self.uri.split('://', 1)[0].lower() - - @property - def api_url(self) -> str: - # /api/v1/core/seed/{uulid} - return reverse_lazy('api-1:get_seed', args=[self.abid]) # + f'?api_key={get_or_create_api_token(request.user)}' - - @property - def api_docs_url(self) -> str: - return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed' - - @property - def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']: - from crawls.models import CrawlSchedule - return CrawlSchedule.objects.filter(template__seed_id=self.pk) - - @property - def snapshot_set(self) -> QuerySet['Snapshot']: - from core.models import Snapshot - - crawl_ids = self.crawl_set.values_list('pk', flat=True) - return Snapshot.objects.filter(crawl_id__in=crawl_ids) diff --git a/archivebox/seeds/tests.py b/archivebox/seeds/tests.py deleted file mode 100644 index 7ce503c2..00000000 --- a/archivebox/seeds/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. diff --git a/archivebox/seeds/views.py b/archivebox/seeds/views.py deleted file mode 100644 index 91ea44a2..00000000 --- a/archivebox/seeds/views.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.shortcuts import render - -# Create your views here.