mirror of https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-21 19:53:06 +00:00
merge seeds and crawls apps
This commit is contained in:
parent 4c25e90378
commit 65afd405b1

15 changed files with 168 additions and 224 deletions
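The practical upshot of the merge is that the standalone seeds app disappears and its Seed model moves into the crawls app, so downstream imports change. A minimal before/after sketch of the import paths, taken directly from the hunks below:

    # before this commit:
    # from seeds.models import Seed
    # from crawls.models import Crawl

    # after this commit, both models come from the merged crawls app:
    from crawls.models import Seed, Crawl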
@@ -85,7 +85,6 @@ ARCHIVEBOX_BUILTIN_PLUGINS = {
     'workers': PACKAGE_DIR / 'workers',
     'core': PACKAGE_DIR / 'core',
     'crawls': PACKAGE_DIR / 'crawls',
-    'seeds': PACKAGE_DIR / 'seeds',
     # 'search': PACKAGE_DIR / 'search',
     # 'core': PACKAGE_DIR / 'core',
 }
@@ -10,8 +10,7 @@ from django.contrib.auth import get_user_model
 from ninja import Router, Schema
 
 from core.models import Snapshot
-from crawls.models import Crawl
-from seeds.models import Seed
+from crawls.models import Seed, Crawl
 
 from .auth import API_AUTH_METHODS
 
@@ -19,7 +18,7 @@ router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
 
 
 class SeedSchema(Schema):
-    TYPE: str = 'seeds.models.Seed'
+    TYPE: str = 'crawls.models.Seed'
 
     id: UUID
     abid: str
@@ -60,7 +59,7 @@ def get_seed(request, seed_id: str):
 
 
 class CrawlSchema(Schema):
-    TYPE: str = 'core.models.Crawl'
+    TYPE: str = 'crawls.models.Crawl'
 
     id: UUID
     abid: str
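Both TYPE defaults are dotted import paths, and after this change they resolve against the merged crawls app (the old 'core.models.Crawl' default no longer matched where Crawl actually lives). A hedged sanity-check sketch, not part of the commit:

    from django.utils.module_loading import import_string

    Seed = import_string('crawls.models.Seed')    # was 'seeds.models.Seed'
    Crawl = import_string('crawls.models.Crawl')  # was 'core.models.Crawl'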
@@ -51,8 +51,7 @@ def add(urls: str | list[str],
     setup_django()
     check_data_folder()
 
-    from seeds.models import Seed
-    from crawls.models import Crawl
+    from crawls.models import Seed, Crawl
     from workers.orchestrator import Orchestrator
     from abid_utils.models import get_or_create_system_user_pk
 
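The single merged import mirrors the model move; the rest of add() is unchanged in this hunk. As a rough, hedged sketch of the flow these imports imply (not the actual function body; the Crawl.objects.create() arguments are assumptions for illustration):

    from pathlib import Path

    from crawls.models import Seed, Crawl
    from abid_utils.models import get_or_create_system_user_pk

    # example sources path taken from the Seed docstring further down in this diff
    source_file = Path('/data/sources/2024-01-02_11-57-51__cli_add.txt')
    seed = Seed.from_file(source_file, label='cli add', parser='auto',
                          created_by=get_or_create_system_user_pk())
    crawl = Crawl.objects.create(seed=seed, max_depth=0,                  # assumed kwargs
                                 created_by_id=seed.created_by_id)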
@@ -65,8 +65,7 @@ INSTALLED_APPS = [
     'config',            # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
     'machine',           # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
     'workers',           # handles starting and managing background workers and processes (orchestrators and actors)
-    'seeds',             # handles Seed model and URL source management
-    'crawls',            # handles Crawl and CrawlSchedule models and management
+    'crawls',            # handles Seed, Crawl, and CrawlSchedule models and management
     'personas',          # handles Persona and session management
     'core',              # core django model with Snapshot, ArchiveResult, etc.
     'api',               # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
 
@@ -1,7 +1,5 @@
 __package__ = 'archivebox.crawls'
 
-import abx
-
 from django.utils.html import format_html, format_html_join
 from django.contrib import admin
 
@@ -10,7 +8,59 @@ from archivebox import DATA_DIR
 from abid_utils.admin import ABIDModelAdmin
 
 from core.models import Snapshot
-from crawls.models import Crawl, CrawlSchedule
+from crawls.models import Seed, Crawl, CrawlSchedule
 
 
+class SeedAdmin(ABIDModelAdmin):
+    list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
+    sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
+    search_fields = ('abid', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
+
+    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
+    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
+
+    list_filter = ('extractor', 'created_by')
+    ordering = ['-created_at']
+    list_per_page = 100
+    actions = ["delete_selected"]
+
+    def num_crawls(self, obj):
+        return obj.crawl_set.count()
+
+    def num_snapshots(self, obj):
+        return obj.snapshot_set.count()
+
+    def scheduled_crawls(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (scheduledcrawl.admin_change_url, scheduledcrawl)
+            for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Scheduled Crawls yet...</i>')
+
+    def crawls(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (crawl.admin_change_url, crawl)
+            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Crawls yet...</i>')
+
+    def snapshots(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (snapshot.admin_change_url, snapshot)
+            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Snapshots yet...</i>')
+
+    def contents(self, obj):
+        if obj.uri.startswith('file:///data/'):
+            source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
+            contents = ""
+            try:
+                contents = source_file.read_text().strip()[:14_000]
+            except Exception as e:
+                contents = f'Error reading {source_file}: {e}'
+
+            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
+
+        return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
+
+
+
+
@@ -102,7 +152,8 @@ class CrawlScheduleAdmin(ABIDModelAdmin):
             for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
         )) or format_html('<i>No Snapshots yet...</i>')
 
-@abx.hookimpl
 def register_admin(admin_site):
+    admin_site.register(Seed, SeedAdmin)
     admin_site.register(Crawl, CrawlAdmin)
     admin_site.register(CrawlSchedule, CrawlScheduleAdmin)
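With import abx and the @abx.hookimpl decorator dropped from this admin module, register_admin is presumably wired up by a hook elsewhere in the crawls app, mirroring the pattern of the deleted seeds/__init__.py later in this diff. A hypothetical sketch of such a hook (not a file shown in this commit):

    import abx

    @abx.hookimpl
    def register_admin(admin_site):
        from .admin import register_admin as register_crawls_admin
        register_crawls_admin(admin_site)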
@@ -1,6 +1,7 @@
 __package__ = 'archivebox.crawls'
 
 from typing import TYPE_CHECKING
+from pathlib import Path
 from django_stubs_ext.db.models import TypedModelMeta
 
 from django.db import models
@@ -12,12 +13,114 @@ from django.utils import timezone
 
 from workers.models import ModelWithStateMachine
 
+from archivebox.config import CONSTANTS
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
 
 if TYPE_CHECKING:
     from core.models import Snapshot, ArchiveResult
 
-from seeds.models import Seed
-
-from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
+
+class Seed(ABIDModel, ModelWithHealthStats):
+    """
+    A fountain that produces URLs (+metadata) each time it's queried e.g.
+        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
+        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
+        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
+        - https://getpocket.com/user/nikisweeting/feed
+        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+        - ...
+    Each query of a Seed can produce the same list of URLs, or a different list each time.
+    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
+
+    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
+    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
+    The outlinks then get turned into new pending Snapshots under the same crawl,
+    and the cycle repeats until Crawl.max_depth.
+
+    Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
+    stateful remote services, files with contents that change, directories that have new files within, etc.
+    """
+
+    abid_prefix = 'src_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.uri'
+    abid_subtype_src = 'self.extractor'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+
+    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    abid = ABIDField(prefix=abid_prefix)
+
+    uri = models.URLField(max_length=2000, blank=False, null=False)    # unique source location where URLs will be loaded from
+    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
+
+    extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
+    tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
+    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
+
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+
+
+    crawl_set: models.Manager['Crawl']
+
+    class Meta:
+        verbose_name = 'Seed'
+        verbose_name_plural = 'Seeds'
+
+        unique_together = (('created_by', 'uri', 'extractor'),)
+
+
+    @classmethod
+    def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
+        source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
+
+        seed, _ = cls.objects.get_or_create(
+            label=label or source_file.name,
+            uri=f'file://{source_path}',
+            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
+            extractor=parser,
+            tags_str=tag,
+            config=config or {},
+        )
+        seed.save()
+        return seed
+
+    @property
+    def source_type(self):
+        # e.g. http/https://
+        #      file://
+        #      pocketapi://
+        #      s3://
+        #      etc..
+        return self.uri.split('://', 1)[0].lower()
+
+    @property
+    def api_url(self) -> str:
+        # /api/v1/core/seed/{uulid}
+        return reverse_lazy('api-1:get_seed', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
+
+    @property
+    def api_docs_url(self) -> str:
+        return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
+
+    @property
+    def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
+        from crawls.models import CrawlSchedule
+        return CrawlSchedule.objects.filter(template__seed_id=self.pk)
+
+    @property
+    def snapshot_set(self) -> QuerySet['Snapshot']:
+        from core.models import Snapshot
+
+        crawl_ids = self.crawl_set.values_list('pk', flat=True)
+        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
+
+
+
 
 class CrawlSchedule(ABIDModel, ModelWithHealthStats):
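For orientation, a minimal usage sketch of the Seed model in its new home (values are illustrative; the file path follows the examples in the docstring above; requires a configured Django environment):

    from pathlib import Path
    from crawls.models import Seed

    seed = Seed.from_file(
        Path('/data/sources/2024-01-02_11-57-51__cli_add.txt'),
        label='cli add',
        parser='auto',
        tag='imported,cli',
    )
    print(seed.source_type)                                   # -> 'file'
    print(seed.crawl_set.count(), seed.snapshot_set.count())  # related Crawls / Snapshots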
@@ -34,7 +34,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
     return []
 
 
-# This should be abstracted by a plugin interface for extractors
+# TODO: This should be abstracted by a plugin interface for extractors
 @enforce_types
 def get_indexable_content(results: QuerySet):
     if not results:
@@ -1,10 +1,11 @@
 __package__ = 'archivebox.search'
 
 from django.contrib import messages
+from django.contrib import admin
 
 from archivebox.search import query_search_index
 
-class SearchResultsAdminMixin:
+class SearchResultsAdminMixin(admin.ModelAdmin):
     def get_search_results(self, request, queryset, search_term: str):
         """Enhances the search queryset with results from the search backend"""
 
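Subclassing admin.ModelAdmin gives the mixin's get_search_results() override a concrete base implementation to fall back on and satisfies type checkers, rather than relying purely on MRO when mixed in. A hypothetical consumer (class name, field names, and module path are illustrative, not from this commit):

    from django.contrib import admin
    from archivebox.search.admin import SearchResultsAdminMixin   # assumed module path

    class ExampleSnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
        search_fields = ('url', 'title')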
@@ -1,12 +0,0 @@
-__package__ = 'archivebox.seeds'
-
-__order__ = 100
-
-
-import abx
-
-
-@abx.hookimpl
-def register_admin(admin_site):
-    from .admin import register_admin as register_seeds_admin
-    register_seeds_admin(admin_site)
@@ -1,68 +0,0 @@
-__package__ = 'archivebox.seeds'
-
-
-import abx
-
-from django.utils.html import format_html_join, format_html
-
-from abid_utils.admin import ABIDModelAdmin
-
-from archivebox import DATA_DIR
-
-from seeds.models import Seed
-
-
-
-class SeedAdmin(ABIDModelAdmin):
-    list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
-    sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-    search_fields = ('abid', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-
-    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
-    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
-
-    list_filter = ('extractor', 'created_by')
-    ordering = ['-created_at']
-    list_per_page = 100
-    actions = ["delete_selected"]
-
-    def num_crawls(self, obj):
-        return obj.crawl_set.count()
-
-    def num_snapshots(self, obj):
-        return obj.snapshot_set.count()
-
-    def scheduled_crawls(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (scheduledcrawl.admin_change_url, scheduledcrawl)
-            for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
-        )) or format_html('<i>No Scheduled Crawls yet...</i>')
-
-    def crawls(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (crawl.admin_change_url, crawl)
-            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
-        )) or format_html('<i>No Crawls yet...</i>')
-
-    def snapshots(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (snapshot.admin_change_url, snapshot)
-            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
-        )) or format_html('<i>No Snapshots yet...</i>')
-
-    def contents(self, obj):
-        if obj.uri.startswith('file:///data/'):
-            source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
-            contents = ""
-            try:
-                contents = source_file.read_text().strip()[:14_000]
-            except Exception as e:
-                contents = f'Error reading {source_file}: {e}'
-
-            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
-
-        return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
-
-
-@abx.hookimpl
-def register_admin(admin_site):
-    admin_site.register(Seed, SeedAdmin)
@@ -1,6 +0,0 @@
-from django.apps import AppConfig
-
-
-class SeedsConfig(AppConfig):
-    default_auto_field = "django.db.models.BigAutoField"
-    name = "seeds"
@@ -1,115 +0,0 @@
-__package__ = 'archivebox.seeds'
-
-
-from typing import TYPE_CHECKING
-from pathlib import Path
-
-from django.db import models
-from django.db.models import QuerySet
-from django.conf import settings
-from django.urls import reverse_lazy
-
-from archivebox.config import CONSTANTS
-from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
-
-if TYPE_CHECKING:
-    from crawls.models import Crawl, CrawlSchedule
-    from core.models import Snapshot
-
-
-class Seed(ABIDModel, ModelWithHealthStats):
-    """
-    A fountain that produces URLs (+metadata) each time it's queried e.g.
-        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
-        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
-        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
-        - https://getpocket.com/user/nikisweeting/feed
-        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
-        - ...
-    Each query of a Seed can produce the same list of URLs, or a different list each time.
-    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
-
-    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
-    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
-    The outlinks then get turned into new pending Snapshots under the same crawl,
-    and the cycle repeats until Crawl.max_depth.
-
-    Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
-    stateful remote services, files with contents that change, directories that have new files within, etc.
-    """
-
-    abid_prefix = 'src_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.uri'
-    abid_subtype_src = 'self.extractor'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
-
-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    abid = ABIDField(prefix=abid_prefix)
-
-    uri = models.URLField(max_length=2000, blank=False, null=False)    # unique source location where URLs will be loaded from
-    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
-    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
-
-    extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
-    tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
-    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
-
-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
-    modified_at = models.DateTimeField(auto_now=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
-
-
-    crawl_set: models.Manager['Crawl']
-
-    class Meta:
-        verbose_name = 'Seed'
-        verbose_name_plural = 'Seeds'
-
-        unique_together = (('created_by', 'uri', 'extractor'),)
-
-
-    @classmethod
-    def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
-        source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
-
-        seed, _ = cls.objects.get_or_create(
-            label=label or source_file.name,
-            uri=f'file://{source_path}',
-            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
-            extractor=parser,
-            tags_str=tag,
-            config=config or {},
-        )
-        seed.save()
-        return seed
-
-    @property
-    def source_type(self):
-        # e.g. http/https://
-        #      file://
-        #      pocketapi://
-        #      s3://
-        #      etc..
-        return self.uri.split('://', 1)[0].lower()
-
-    @property
-    def api_url(self) -> str:
-        # /api/v1/core/seed/{uulid}
-        return reverse_lazy('api-1:get_seed', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
-
-    @property
-    def api_docs_url(self) -> str:
-        return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
-
-    @property
-    def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
-        from crawls.models import CrawlSchedule
-        return CrawlSchedule.objects.filter(template__seed_id=self.pk)
-
-    @property
-    def snapshot_set(self) -> QuerySet['Snapshot']:
-        from core.models import Snapshot
-
-        crawl_ids = self.crawl_set.values_list('pk', flat=True)
-        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.
@@ -1,3 +0,0 @@
-from django.shortcuts import render
-
-# Create your views here.