mirror of https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-21 19:53:06 +00:00
merge seeds and crawls apps
This commit is contained in:
parent 4c25e90378
commit 65afd405b1

15 changed files with 168 additions and 224 deletions
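The practical upshot of the merge is that the standalone seeds app disappears and its Seed model moves into the crawls app, so downstream imports change. A minimal before/after sketch of the import paths, taken directly from the hunks below:

    # before this commit:
    # from seeds.models import Seed
    # from crawls.models import Crawl

    # after this commit, both models come from the merged crawls app:
    from crawls.models import Seed, Crawl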
@@ -85,7 +85,6 @@ ARCHIVEBOX_BUILTIN_PLUGINS = {
     'workers': PACKAGE_DIR / 'workers',
     'core': PACKAGE_DIR / 'core',
     'crawls': PACKAGE_DIR / 'crawls',
-    'seeds': PACKAGE_DIR / 'seeds',
     # 'search': PACKAGE_DIR / 'search',
     # 'core': PACKAGE_DIR / 'core',
 }
@@ -10,8 +10,7 @@ from django.contrib.auth import get_user_model
 from ninja import Router, Schema
 
 from core.models import Snapshot
-from crawls.models import Crawl
-from seeds.models import Seed
+from crawls.models import Seed, Crawl
 
 from .auth import API_AUTH_METHODS
 
@@ -19,7 +18,7 @@ router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
 
 
 class SeedSchema(Schema):
-    TYPE: str = 'seeds.models.Seed'
+    TYPE: str = 'crawls.models.Seed'
 
     id: UUID
     abid: str
@@ -60,7 +59,7 @@ def get_seed(request, seed_id: str):
 
 
 class CrawlSchema(Schema):
-    TYPE: str = 'core.models.Crawl'
+    TYPE: str = 'crawls.models.Crawl'
 
     id: UUID
     abid: str
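Both TYPE defaults are dotted import paths, and after this change they resolve against the merged crawls app (the old 'core.models.Crawl' default no longer matched where Crawl actually lives). A hedged sanity-check sketch, not part of the commit:

    from django.utils.module_loading import import_string

    Seed = import_string('crawls.models.Seed')    # was 'seeds.models.Seed'
    Crawl = import_string('crawls.models.Crawl')  # was 'core.models.Crawl'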
@@ -51,8 +51,7 @@ def add(urls: str | list[str],
     setup_django()
     check_data_folder()
 
-    from seeds.models import Seed
-    from crawls.models import Crawl
+    from crawls.models import Seed, Crawl
     from workers.orchestrator import Orchestrator
     from abid_utils.models import get_or_create_system_user_pk
 
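The single merged import mirrors the model move; the rest of add() is unchanged in this hunk. As a rough, hedged sketch of the flow these imports imply (not the actual function body; the Crawl.objects.create() arguments are assumptions for illustration):

    from pathlib import Path

    from crawls.models import Seed, Crawl
    from abid_utils.models import get_or_create_system_user_pk

    # example sources path taken from the Seed docstring further down in this diff
    source_file = Path('/data/sources/2024-01-02_11-57-51__cli_add.txt')
    seed = Seed.from_file(source_file, label='cli add', parser='auto',
                          created_by=get_or_create_system_user_pk())
    crawl = Crawl.objects.create(seed=seed, max_depth=0,                  # assumed kwargs
                                 created_by_id=seed.created_by_id)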
@@ -65,8 +65,7 @@ INSTALLED_APPS = [
     'config',            # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
     'machine',           # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
     'workers',           # handles starting and managing background workers and processes (orchestrators and actors)
-    'seeds',             # handles Seed model and URL source management
-    'crawls',            # handles Crawl and CrawlSchedule models and management
+    'crawls',            # handles Seed, Crawl, and CrawlSchedule models and management
     'personas',          # handles Persona and session management
     'core',              # core django model with Snapshot, ArchiveResult, etc.
     'api',               # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
 
@@ -1,7 +1,5 @@
 __package__ = 'archivebox.crawls'
 
-import abx
-
 from django.utils.html import format_html, format_html_join
 from django.contrib import admin
 
@@ -10,7 +8,59 @@ from archivebox import DATA_DIR
 from abid_utils.admin import ABIDModelAdmin
 
 from core.models import Snapshot
-from crawls.models import Crawl, CrawlSchedule
+from crawls.models import Seed, Crawl, CrawlSchedule
 
 
+class SeedAdmin(ABIDModelAdmin):
+    list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
+    sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
+    search_fields = ('abid', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
+
+    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
+    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
+
+    list_filter = ('extractor', 'created_by')
+    ordering = ['-created_at']
+    list_per_page = 100
+    actions = ["delete_selected"]
+
+    def num_crawls(self, obj):
+        return obj.crawl_set.count()
+
+    def num_snapshots(self, obj):
+        return obj.snapshot_set.count()
+
+    def scheduled_crawls(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (scheduledcrawl.admin_change_url, scheduledcrawl)
+            for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Scheduled Crawls yet...</i>')
+
+    def crawls(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (crawl.admin_change_url, crawl)
+            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Crawls yet...</i>')
+
+    def snapshots(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (snapshot.admin_change_url, snapshot)
+            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Snapshots yet...</i>')
+
+    def contents(self, obj):
+        if obj.uri.startswith('file:///data/'):
+            source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
+            contents = ""
+            try:
+                contents = source_file.read_text().strip()[:14_000]
+            except Exception as e:
+                contents = f'Error reading {source_file}: {e}'
+
+            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
+
+        return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
+
+
+
+
@@ -102,7 +152,8 @@ class CrawlScheduleAdmin(ABIDModelAdmin):
             for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
         )) or format_html('<i>No Snapshots yet...</i>')
 
-@abx.hookimpl
 def register_admin(admin_site):
+    admin_site.register(Seed, SeedAdmin)
     admin_site.register(Crawl, CrawlAdmin)
     admin_site.register(CrawlSchedule, CrawlScheduleAdmin)
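With import abx and the @abx.hookimpl decorator dropped from this admin module, register_admin is presumably wired up by a hook elsewhere in the crawls app, mirroring the pattern of the deleted seeds/__init__.py later in this diff. A hypothetical sketch of such a hook (not a file shown in this commit):

    import abx

    @abx.hookimpl
    def register_admin(admin_site):
        from .admin import register_admin as register_crawls_admin
        register_crawls_admin(admin_site)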
@@ -1,6 +1,7 @@
 __package__ = 'archivebox.crawls'
 
 from typing import TYPE_CHECKING
+from pathlib import Path
 from django_stubs_ext.db.models import TypedModelMeta
 
 from django.db import models
@@ -12,12 +13,114 @@ from django.utils import timezone
 
 from workers.models import ModelWithStateMachine
 
+from archivebox.config import CONSTANTS
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
 
 if TYPE_CHECKING:
     from core.models import Snapshot, ArchiveResult
 
-from seeds.models import Seed
-
-from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
+
+class Seed(ABIDModel, ModelWithHealthStats):
+    """
+    A fountain that produces URLs (+metadata) each time it's queried e.g.
+        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
+        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
+        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
+        - https://getpocket.com/user/nikisweeting/feed
+        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+        - ...
+    Each query of a Seed can produce the same list of URLs, or a different list each time.
+    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
+
+    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
+    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
+    The outlinks then get turned into new pending Snapshots under the same crawl,
+    and the cycle repeats until Crawl.max_depth.
+
+    Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
+    stateful remote services, files with contents that change, directories that have new files within, etc.
+    """
+
+    abid_prefix = 'src_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.uri'
+    abid_subtype_src = 'self.extractor'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+
+    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    abid = ABIDField(prefix=abid_prefix)
+
+    uri = models.URLField(max_length=2000, blank=False, null=False)    # unique source location where URLs will be loaded from
+    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
+
+    extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
+    tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
+    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
+
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+
+
+    crawl_set: models.Manager['Crawl']
+
+    class Meta:
+        verbose_name = 'Seed'
+        verbose_name_plural = 'Seeds'
+
+        unique_together = (('created_by', 'uri', 'extractor'),)
+
+
+    @classmethod
+    def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
+        source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
+
+        seed, _ = cls.objects.get_or_create(
+            label=label or source_file.name,
+            uri=f'file://{source_path}',
+            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
+            extractor=parser,
+            tags_str=tag,
+            config=config or {},
+        )
+        seed.save()
+        return seed
+
+    @property
+    def source_type(self):
+        # e.g. http/https://
+        #      file://
+        #      pocketapi://
+        #      s3://
+        #      etc..
+        return self.uri.split('://', 1)[0].lower()
+
+    @property
+    def api_url(self) -> str:
+        # /api/v1/core/seed/{uulid}
+        return reverse_lazy('api-1:get_seed', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
+
+    @property
+    def api_docs_url(self) -> str:
+        return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
+
+    @property
+    def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
+        from crawls.models import CrawlSchedule
+        return CrawlSchedule.objects.filter(template__seed_id=self.pk)
+
+    @property
+    def snapshot_set(self) -> QuerySet['Snapshot']:
+        from core.models import Snapshot
+
+        crawl_ids = self.crawl_set.values_list('pk', flat=True)
+        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
+
+
+
 
 class CrawlSchedule(ABIDModel, ModelWithHealthStats):
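For orientation, a minimal usage sketch of the Seed model in its new home (values are illustrative; the file path follows the examples in the docstring above; requires a configured Django environment):

    from pathlib import Path
    from crawls.models import Seed

    seed = Seed.from_file(
        Path('/data/sources/2024-01-02_11-57-51__cli_add.txt'),
        label='cli add',
        parser='auto',
        tag='imported,cli',
    )
    print(seed.source_type)                                   # -> 'file'
    print(seed.crawl_set.count(), seed.snapshot_set.count())  # related Crawls / Snapshots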
@@ -34,7 +34,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
     return []
 
 
-# This should be abstracted by a plugin interface for extractors
+# TODO: This should be abstracted by a plugin interface for extractors
 @enforce_types
 def get_indexable_content(results: QuerySet):
     if not results:
@@ -1,10 +1,11 @@
 __package__ = 'archivebox.search'
 
 from django.contrib import messages
+from django.contrib import admin
 
 from archivebox.search import query_search_index
 
-class SearchResultsAdminMixin:
+class SearchResultsAdminMixin(admin.ModelAdmin):
     def get_search_results(self, request, queryset, search_term: str):
         """Enhances the search queryset with results from the search backend"""
 
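Subclassing admin.ModelAdmin gives the mixin's get_search_results() override a concrete base implementation to fall back on and satisfies type checkers, rather than relying purely on MRO when mixed in. A hypothetical consumer (class name, field names, and module path are illustrative, not from this commit):

    from django.contrib import admin
    from archivebox.search.admin import SearchResultsAdminMixin   # assumed module path

    class ExampleSnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
        search_fields = ('url', 'title')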
@@ -1,12 +0,0 @@
-__package__ = 'archivebox.seeds'
-
-__order__ = 100
-
-
-import abx
-
-
-@abx.hookimpl
-def register_admin(admin_site):
-    from .admin import register_admin as register_seeds_admin
-    register_seeds_admin(admin_site)
@@ -1,68 +0,0 @@
-__package__ = 'archivebox.seeds'
-
-
-import abx
-
-from django.utils.html import format_html_join, format_html
-
-from abid_utils.admin import ABIDModelAdmin
-
-from archivebox import DATA_DIR
-
-from seeds.models import Seed
-
-
-
-class SeedAdmin(ABIDModelAdmin):
-    list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
-    sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-    search_fields = ('abid', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-
-    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
-    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
-
-    list_filter = ('extractor', 'created_by')
-    ordering = ['-created_at']
-    list_per_page = 100
-    actions = ["delete_selected"]
-
-    def num_crawls(self, obj):
-        return obj.crawl_set.count()
-
-    def num_snapshots(self, obj):
-        return obj.snapshot_set.count()
-
-    def scheduled_crawls(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (scheduledcrawl.admin_change_url, scheduledcrawl)
-            for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
-        )) or format_html('<i>No Scheduled Crawls yet...</i>')
-
-    def crawls(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (crawl.admin_change_url, crawl)
-            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
-        )) or format_html('<i>No Crawls yet...</i>')
-
-    def snapshots(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (snapshot.admin_change_url, snapshot)
-            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
-        )) or format_html('<i>No Snapshots yet...</i>')
-
-    def contents(self, obj):
-        if obj.uri.startswith('file:///data/'):
-            source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
-            contents = ""
-            try:
-                contents = source_file.read_text().strip()[:14_000]
-            except Exception as e:
-                contents = f'Error reading {source_file}: {e}'
-
-            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
-
-        return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
-
-
-@abx.hookimpl
-def register_admin(admin_site):
-    admin_site.register(Seed, SeedAdmin)
@@ -1,6 +0,0 @@
-from django.apps import AppConfig
-
-
-class SeedsConfig(AppConfig):
-    default_auto_field = "django.db.models.BigAutoField"
-    name = "seeds"
@@ -1,115 +0,0 @@
-__package__ = 'archivebox.seeds'
-
-
-from typing import TYPE_CHECKING
-from pathlib import Path
-
-from django.db import models
-from django.db.models import QuerySet
-from django.conf import settings
-from django.urls import reverse_lazy
-
-from archivebox.config import CONSTANTS
-from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
-
-if TYPE_CHECKING:
-    from crawls.models import Crawl, CrawlSchedule
-    from core.models import Snapshot
-
-
-class Seed(ABIDModel, ModelWithHealthStats):
-    """
-    A fountain that produces URLs (+metadata) each time it's queried e.g.
-        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
-        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
-        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
-        - https://getpocket.com/user/nikisweeting/feed
-        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
-        - ...
-    Each query of a Seed can produce the same list of URLs, or a different list each time.
-    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
-
-    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
-    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
-    The outlinks then get turned into new pending Snapshots under the same crawl,
-    and the cycle repeats until Crawl.max_depth.
-
-    Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
-    stateful remote services, files with contents that change, directories that have new files within, etc.
-    """
-
-    abid_prefix = 'src_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.uri'
-    abid_subtype_src = 'self.extractor'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
-
-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    abid = ABIDField(prefix=abid_prefix)
-
-    uri = models.URLField(max_length=2000, blank=False, null=False)    # unique source location where URLs will be loaded from
-    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
-    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
-
-    extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
-    tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
-    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
-
-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
-    modified_at = models.DateTimeField(auto_now=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
-
-
-    crawl_set: models.Manager['Crawl']
-
-    class Meta:
-        verbose_name = 'Seed'
-        verbose_name_plural = 'Seeds'
-
-        unique_together = (('created_by', 'uri', 'extractor'),)
-
-
-    @classmethod
-    def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
-        source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
-
-        seed, _ = cls.objects.get_or_create(
-            label=label or source_file.name,
-            uri=f'file://{source_path}',
-            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
-            extractor=parser,
-            tags_str=tag,
-            config=config or {},
-        )
-        seed.save()
-        return seed
-
-    @property
-    def source_type(self):
-        # e.g. http/https://
-        #      file://
-        #      pocketapi://
-        #      s3://
-        #      etc..
-        return self.uri.split('://', 1)[0].lower()
-
-    @property
-    def api_url(self) -> str:
-        # /api/v1/core/seed/{uulid}
-        return reverse_lazy('api-1:get_seed', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
-
-    @property
-    def api_docs_url(self) -> str:
-        return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
-
-    @property
-    def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
-        from crawls.models import CrawlSchedule
-        return CrawlSchedule.objects.filter(template__seed_id=self.pk)
-
-    @property
-    def snapshot_set(self) -> QuerySet['Snapshot']:
-        from core.models import Snapshot
-
-        crawl_ids = self.crawl_set.values_list('pk', flat=True)
-        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.
@@ -1,3 +0,0 @@
-from django.shortcuts import render
-
-# Create your views here.