merge seeds and crawls apps

This commit is contained in:
Nick Sweeting 2024-11-18 19:23:14 -08:00
parent 4c25e90378
commit 65afd405b1
No known key found for this signature in database
15 changed files with 168 additions and 224 deletions
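For downstream code, the visible effect of the merge is a one-line import change that repeats across the hunks below; a minimal before/after sketch (the importing module itself is hypothetical):

    # Before this commit (separate seeds app):
    #   from seeds.models import Seed
    #   from crawls.models import Crawl

    # After this commit, both models come from the merged crawls app:
    from crawls.models import Seed, Crawl

The INSTALLED_APPS entry and the API schema TYPE strings change in the same direction, as the diffs below show.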

View file

@@ -85,7 +85,6 @@ ARCHIVEBOX_BUILTIN_PLUGINS = {
     'workers': PACKAGE_DIR / 'workers',
     'core': PACKAGE_DIR / 'core',
     'crawls': PACKAGE_DIR / 'crawls',
-    'seeds': PACKAGE_DIR / 'seeds',
     # 'search': PACKAGE_DIR / 'search',
     # 'core': PACKAGE_DIR / 'core',
 }

View file

@@ -10,8 +10,7 @@ from django.contrib.auth import get_user_model
 from ninja import Router, Schema
 from core.models import Snapshot
-from crawls.models import Crawl
-from seeds.models import Seed
+from crawls.models import Seed, Crawl
 from .auth import API_AUTH_METHODS
@@ -19,7 +18,7 @@ router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
 class SeedSchema(Schema):
-    TYPE: str = 'seeds.models.Seed'
+    TYPE: str = 'crawls.models.Seed'
     id: UUID
     abid: str
@@ -60,7 +59,7 @@ def get_seed(request, seed_id: str):
 class CrawlSchema(Schema):
-    TYPE: str = 'core.models.Crawl'
+    TYPE: str = 'crawls.models.Crawl'
     id: UUID
     abid: str
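Clients of the REST API that branch on the TYPE discriminator should expect the new module paths; a hypothetical client-side sketch, with the payload reduced to the fields visible in the schemas above:

    def handle_record(record: dict):
        # TYPE strings changed in this commit:
        #   'seeds.models.Seed' -> 'crawls.models.Seed'
        #   'core.models.Crawl' -> 'crawls.models.Crawl'
        if record['TYPE'] == 'crawls.models.Seed':
            print('Seed', record['abid'])
        elif record['TYPE'] == 'crawls.models.Crawl':
            print('Crawl', record['abid'])

    handle_record({'TYPE': 'crawls.models.Seed', 'id': '...', 'abid': 'src_...'})  # example payload shape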

View file

@@ -51,8 +51,7 @@ def add(urls: str | list[str],
     setup_django()
     check_data_folder()
-    from seeds.models import Seed
-    from crawls.models import Crawl
+    from crawls.models import Seed, Crawl
     from workers.orchestrator import Orchestrator
     from abid_utils.models import get_or_create_system_user_pk

View file

@@ -65,8 +65,7 @@ INSTALLED_APPS = [
     'config',    # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
     'machine',   # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
     'workers',   # handles starting and managing background workers and processes (orchestrators and actors)
-    'seeds',     # handles Seed model and URL source management
-    'crawls',    # handles Crawl and CrawlSchedule models and management
+    'crawls',    # handles Seed, Crawl, and CrawlSchedule models and management
     'personas',  # handles Persona and session management
     'core',      # core django model with Snapshot, ArchiveResult, etc.
     'api',       # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
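One way to confirm the merged layout locally is to inspect the Django app registry once settings are loaded; a minimal sketch, assuming an initialized ArchiveBox/Django environment (e.g. after setup_django()):

    from django.apps import apps

    assert apps.is_installed('crawls')        # merged app providing Seed, Crawl, CrawlSchedule
    assert not apps.is_installed('seeds')     # the standalone seeds app is gone

    Seed = apps.get_model('crawls', 'Seed')   # Seed now lives under the 'crawls' app label
    print(Seed._meta.app_label)               # -> 'crawls'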

View file

@@ -1,7 +1,5 @@
 __package__ = 'archivebox.crawls'
 
-import abx
-
 from django.utils.html import format_html, format_html_join
 from django.contrib import admin
@@ -10,7 +8,59 @@ from archivebox import DATA_DIR
 from abid_utils.admin import ABIDModelAdmin
 
 from core.models import Snapshot
-from crawls.models import Crawl, CrawlSchedule
+from crawls.models import Seed, Crawl, CrawlSchedule
+
+
+class SeedAdmin(ABIDModelAdmin):
+    list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
+    sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
+    search_fields = ('abid', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
+
+    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
+    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
+
+    list_filter = ('extractor', 'created_by')
+    ordering = ['-created_at']
+    list_per_page = 100
+    actions = ["delete_selected"]
+
+    def num_crawls(self, obj):
+        return obj.crawl_set.count()
+
+    def num_snapshots(self, obj):
+        return obj.snapshot_set.count()
+
+    def scheduled_crawls(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (scheduledcrawl.admin_change_url, scheduledcrawl)
+            for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Scheduled Crawls yet...</i>')
+
+    def crawls(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (crawl.admin_change_url, crawl)
+            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Crawls yet...</i>')
+
+    def snapshots(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (snapshot.admin_change_url, snapshot)
+            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Snapshots yet...</i>')
+
+    def contents(self, obj):
+        if obj.uri.startswith('file:///data/'):
+            source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
+            contents = ""
+            try:
+                contents = source_file.read_text().strip()[:14_000]
+            except Exception as e:
+                contents = f'Error reading {source_file}: {e}'
+            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
+
+        return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
@@ -102,7 +152,8 @@ class CrawlScheduleAdmin(ABIDModelAdmin):
             for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
         )) or format_html('<i>No Snapshots yet...</i>')
 
-@abx.hookimpl
 def register_admin(admin_site):
+    admin_site.register(Seed, SeedAdmin)
     admin_site.register(Crawl, CrawlAdmin)
     admin_site.register(CrawlSchedule, CrawlScheduleAdmin)

View file

@@ -1,6 +1,7 @@
 __package__ = 'archivebox.crawls'
 
 from typing import TYPE_CHECKING
+from pathlib import Path
 
 from django_stubs_ext.db.models import TypedModelMeta
 from django.db import models
@@ -12,12 +13,114 @@ from django.utils import timezone
 from workers.models import ModelWithStateMachine
 
+from archivebox.config import CONSTANTS
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
+
 if TYPE_CHECKING:
     from core.models import Snapshot, ArchiveResult
-    from seeds.models import Seed
-
-from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
+
+
+class Seed(ABIDModel, ModelWithHealthStats):
+    """
+    A fountain that produces URLs (+metadata) each time it's queried e.g.
+        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
+        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
+        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
+        - https://getpocket.com/user/nikisweeting/feed
+        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+        - ...
+
+    Each query of a Seed can produce the same list of URLs, or a different list each time.
+    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
+
+    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
+    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
+    The outlinks then get turned into new pending Snapshots under the same crawl,
+    and the cycle repeats until Crawl.max_depth.
+
+    Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
+    stateful remote services, files with contents that change, directories that have new files within, etc.
+    """
+
+    abid_prefix = 'src_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.uri'
+    abid_subtype_src = 'self.extractor'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+
+    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    abid = ABIDField(prefix=abid_prefix)
+
+    uri = models.URLField(max_length=2000, blank=False, null=False)    # unique source location where URLs will be loaded from
+    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
+
+    extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
+    tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
+    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
+
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+
+    crawl_set: models.Manager['Crawl']
+
+    class Meta:
+        verbose_name = 'Seed'
+        verbose_name_plural = 'Seeds'
+        unique_together = (('created_by', 'uri', 'extractor'),)
+
+    @classmethod
+    def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
+        source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
+
+        seed, _ = cls.objects.get_or_create(
+            label=label or source_file.name,
+            uri=f'file://{source_path}',
+            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
+            extractor=parser,
+            tags_str=tag,
+            config=config or {},
+        )
+        seed.save()
+        return seed
+
+    @property
+    def source_type(self):
+        # e.g. http/https://
+        #      file://
+        #      pocketapi://
+        #      s3://
+        #      etc..
+        return self.uri.split('://', 1)[0].lower()
+
+    @property
+    def api_url(self) -> str:
+        # /api/v1/core/seed/{uulid}
+        return reverse_lazy('api-1:get_seed', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
+
+    @property
+    def api_docs_url(self) -> str:
+        return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
+
+    @property
+    def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
+        from crawls.models import CrawlSchedule
+        return CrawlSchedule.objects.filter(template__seed_id=self.pk)
+
+    @property
+    def snapshot_set(self) -> QuerySet['Snapshot']:
+        from core.models import Snapshot
+
+        crawl_ids = self.crawl_set.values_list('pk', flat=True)
+        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
+
+
 class CrawlSchedule(ABIDModel, ModelWithHealthStats):
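Since Seed (including the from_file() helper and the reverse accessors above) now lives in crawls.models, a minimal usage sketch under the merged layout could look like the following; the file path and label are hypothetical, everything else is taken from the model code shown in this hunk.

    from pathlib import Path

    from crawls.models import Seed

    # Hypothetical source file of URLs to import (the path is an assumption for this sketch)
    source_file = Path('/data/sources/2024-11-18__example_import.txt')

    # from_file() get_or_creates a Seed pointing at the file, owned by the system user by default
    seed = Seed.from_file(source_file, label='example import', parser='auto', tag='imported')

    print(seed.source_type)           # 'file', derived from the uri scheme
    print(seed.crawl_set.count())     # Crawls started from this Seed
    print(seed.snapshot_set.count())  # Snapshots produced by those Crawls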

View file

@@ -34,7 +34,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
     return []
 
-# This should be abstracted by a plugin interface for extractors
+# TODO: This should be abstracted by a plugin interface for extractors
 @enforce_types
 def get_indexable_content(results: QuerySet):
     if not results:

View file

@@ -1,10 +1,11 @@
 __package__ = 'archivebox.search'
 
 from django.contrib import messages
+from django.contrib import admin
 
 from archivebox.search import query_search_index
 
 
-class SearchResultsAdminMixin:
+class SearchResultsAdminMixin(admin.ModelAdmin):
     def get_search_results(self, request, queryset, search_term: str):
         """Enhances the search queryset with results from the search backend"""

View file

@@ -1,12 +0,0 @@
-__package__ = 'archivebox.seeds'
-__order__ = 100
-
-import abx
-
-
-@abx.hookimpl
-def register_admin(admin_site):
-    from .admin import register_admin as register_seeds_admin
-    register_seeds_admin(admin_site)

View file

@@ -1,68 +0,0 @@
-__package__ = 'archivebox.seeds'
-
-import abx
-
-from django.utils.html import format_html_join, format_html
-
-from abid_utils.admin import ABIDModelAdmin
-
-from archivebox import DATA_DIR
-
-from seeds.models import Seed
-
-
-class SeedAdmin(ABIDModelAdmin):
-    list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
-    sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-    search_fields = ('abid', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-
-    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
-    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
-
-    list_filter = ('extractor', 'created_by')
-    ordering = ['-created_at']
-    list_per_page = 100
-    actions = ["delete_selected"]
-
-    def num_crawls(self, obj):
-        return obj.crawl_set.count()
-
-    def num_snapshots(self, obj):
-        return obj.snapshot_set.count()
-
-    def scheduled_crawls(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (scheduledcrawl.admin_change_url, scheduledcrawl)
-            for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
-        )) or format_html('<i>No Scheduled Crawls yet...</i>')
-
-    def crawls(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (crawl.admin_change_url, crawl)
-            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
-        )) or format_html('<i>No Crawls yet...</i>')
-
-    def snapshots(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (snapshot.admin_change_url, snapshot)
-            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
-        )) or format_html('<i>No Snapshots yet...</i>')
-
-    def contents(self, obj):
-        if obj.uri.startswith('file:///data/'):
-            source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
-            contents = ""
-            try:
-                contents = source_file.read_text().strip()[:14_000]
-            except Exception as e:
-                contents = f'Error reading {source_file}: {e}'
-            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
-
-        return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
-
-
-@abx.hookimpl
-def register_admin(admin_site):
-    admin_site.register(Seed, SeedAdmin)

View file

@@ -1,6 +0,0 @@
-from django.apps import AppConfig
-
-
-class SeedsConfig(AppConfig):
-    default_auto_field = "django.db.models.BigAutoField"
-    name = "seeds"

View file

@@ -1,115 +0,0 @@
-__package__ = 'archivebox.seeds'
-
-from typing import TYPE_CHECKING
-from pathlib import Path
-
-from django.db import models
-from django.db.models import QuerySet
-from django.conf import settings
-from django.urls import reverse_lazy
-
-from archivebox.config import CONSTANTS
-from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
-
-if TYPE_CHECKING:
-    from crawls.models import Crawl, CrawlSchedule
-    from core.models import Snapshot
-
-
-class Seed(ABIDModel, ModelWithHealthStats):
-    """
-    A fountain that produces URLs (+metadata) each time it's queried e.g.
-        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
-        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
-        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
-        - https://getpocket.com/user/nikisweeting/feed
-        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
-        - ...
-
-    Each query of a Seed can produce the same list of URLs, or a different list each time.
-    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
-
-    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
-    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
-    The outlinks then get turned into new pending Snapshots under the same crawl,
-    and the cycle repeats until Crawl.max_depth.
-
-    Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
-    stateful remote services, files with contents that change, directories that have new files within, etc.
-    """
-
-    abid_prefix = 'src_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.uri'
-    abid_subtype_src = 'self.extractor'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
-
-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    abid = ABIDField(prefix=abid_prefix)
-
-    uri = models.URLField(max_length=2000, blank=False, null=False)    # unique source location where URLs will be loaded from
-    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
-    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
-
-    extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
-    tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
-    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
-
-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
-    modified_at = models.DateTimeField(auto_now=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
-
-    crawl_set: models.Manager['Crawl']
-
-    class Meta:
-        verbose_name = 'Seed'
-        verbose_name_plural = 'Seeds'
-        unique_together = (('created_by', 'uri', 'extractor'),)
-
-    @classmethod
-    def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
-        source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
-
-        seed, _ = cls.objects.get_or_create(
-            label=label or source_file.name,
-            uri=f'file://{source_path}',
-            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
-            extractor=parser,
-            tags_str=tag,
-            config=config or {},
-        )
-        seed.save()
-        return seed
-
-    @property
-    def source_type(self):
-        # e.g. http/https://
-        #      file://
-        #      pocketapi://
-        #      s3://
-        #      etc..
-        return self.uri.split('://', 1)[0].lower()
-
-    @property
-    def api_url(self) -> str:
-        # /api/v1/core/seed/{uulid}
-        return reverse_lazy('api-1:get_seed', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
-
-    @property
-    def api_docs_url(self) -> str:
-        return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
-
-    @property
-    def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
-        from crawls.models import CrawlSchedule
-        return CrawlSchedule.objects.filter(template__seed_id=self.pk)
-
-    @property
-    def snapshot_set(self) -> QuerySet['Snapshot']:
-        from core.models import Snapshot
-
-        crawl_ids = self.crawl_set.values_list('pk', flat=True)
-        return Snapshot.objects.filter(crawl_id__in=crawl_ids)

View file

@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.

View file

@@ -1,3 +0,0 @@
-from django.shortcuts import render
-
-# Create your views here.