API improvements

This commit is contained in:
Nick Sweeting 2024-11-18 04:27:19 -08:00
parent c7bd9449d5
commit eeb2671e4d
No known key found for this signature in database
7 changed files with 157 additions and 127 deletions

View file

@ -37,9 +37,9 @@ html_description=f'''
def register_urls(api: NinjaAPI) -> NinjaAPI: def register_urls(api: NinjaAPI) -> NinjaAPI:
api.add_router('/auth/', 'api.v1_auth.router') # api.add_router('/auth/', 'api.v1_auth.router')
api.add_router('/core/', 'api.v1_core.router') api.add_router('/core/', 'api.v1_core.router')
api.add_router('/crawls/', 'api.v1_core.router') api.add_router('/crawls/', 'api.v1_crawls.router')
api.add_router('/cli/', 'api.v1_cli.router') api.add_router('/cli/', 'api.v1_cli.router')
api.add_router('/jobs/', 'api.v1_actors.router') api.add_router('/jobs/', 'api.v1_actors.router')
return api return api
@ -83,7 +83,7 @@ class NinjaAPIWithIOCapture(NinjaAPI):
api = NinjaAPIWithIOCapture( api = NinjaAPIWithIOCapture(
title='ArchiveBox API', title='ArchiveBox API',
description=html_description, description=html_description,
version='1.0.0', version=VERSION,
csrf=False, csrf=False,
auth=API_AUTH_METHODS, auth=API_AUTH_METHODS,
urls_namespace="api-1", urls_namespace="api-1",

View file

@ -17,10 +17,10 @@ from archivebox.misc.util import ansi_to_html
from archivebox.config.common import ARCHIVING_CONFIG from archivebox.config.common import ARCHIVING_CONFIG
from .auth import API_AUTH_METHODS # from .auth import API_AUTH_METHODS
# router for API that exposes archivebox cli subcommands as REST endpoints # router for API that exposes archivebox cli subcommands as REST endpoints
router = Router(tags=['ArchiveBox CLI Sub-Commands'], auth=API_AUTH_METHODS) router = Router(tags=['ArchiveBox CLI Sub-Commands'])
# Schemas # Schemas

View file

@ -16,12 +16,13 @@ from ninja.errors import HttpError
from core.models import Snapshot, ArchiveResult, Tag from core.models import Snapshot, ArchiveResult, Tag
from api.models import APIToken, OutboundWebhook from api.models import APIToken, OutboundWebhook
from crawls.models import Crawl from api.v1_crawls import CrawlSchema, SeedSchema
from seeds.models import Seed
from .auth import API_AUTH_METHODS # from .auth import API_AUTH_METHODS
router = Router(tags=['Core Models'], auth=API_AUTH_METHODS)
router = Router(tags=['Core Models'])
@ -397,108 +398,6 @@ def get_tag(request, tag_id: str, with_snapshots: bool=True):
pass pass
return tag return tag
class SeedSchema(Schema):
    """API representation of a seeds.models.Seed (a source of URLs to be crawled)."""
    TYPE: str = 'seeds.models.Seed'

    id: UUID
    abid: str

    modified_at: datetime
    created_at: datetime
    created_by_id: str
    created_by_username: str

    uri: str
    tags_str: str
    config: dict

    @staticmethod
    def resolve_created_by_id(obj):
        # FK value is exposed as a string over the API regardless of its DB type
        return str(obj.created_by_id)

    @staticmethod
    def resolve_created_by_username(obj):
        # NOTE(review): per-object User lookup — N+1 queries when serializing a list of seeds
        User = get_user_model()
        return User.objects.get(id=obj.created_by_id).username
@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
def get_seeds(request):
    """List all Seeds."""
    return Seed.objects.all().distinct()
@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
def get_seed(request, seed_id: str):
    """Get a specific Seed by (partial) id or abid; returns None if not found."""
    seed = None
    # flags read by nested schema resolvers via the request object
    request.with_snapshots = False
    request.with_archiveresults = False
    try:
        seed = Seed.objects.get(Q(abid__icontains=seed_id) | Q(id__icontains=seed_id))
    except Exception:
        # not found (or ambiguous partial match) -> fall through and return None
        pass
    return seed
class CrawlSchema(Schema):
    """API representation of a Crawl (a Seed being crawled to a given max_depth)."""
    # NOTE(review): TYPE says 'core.models.Crawl' but the model is imported from
    # crawls.models — confirm whether this should read 'crawls.models.Crawl'
    # (API consumers may match on this string, so not changed here).
    TYPE: str = 'core.models.Crawl'

    id: UUID
    abid: str

    modified_at: datetime
    created_at: datetime
    created_by_id: str
    created_by_username: str

    status: str
    retry_at: datetime | None

    seed: SeedSchema
    max_depth: int

    # snapshots: List[SnapshotSchema]

    @staticmethod
    def resolve_created_by_id(obj):
        # FK value is exposed as a string over the API regardless of its DB type
        return str(obj.created_by_id)

    @staticmethod
    def resolve_created_by_username(obj):
        # NOTE(review): per-object User lookup — N+1 queries when serializing lists
        User = get_user_model()
        return User.objects.get(id=obj.created_by_id).username

    @staticmethod
    def resolve_snapshots(obj, context):
        # only include snapshots when the endpoint opted in via request.with_snapshots
        if context['request'].with_snapshots:
            return obj.snapshot_set.all().distinct()
        return Snapshot.objects.none()
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
    """List all Crawls."""
    return Crawl.objects.all().distinct()
@router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
    """Get a specific Crawl by id or abid."""
    crawl = None
    # flags read by nested schema resolvers via the request object
    request.with_snapshots = with_snapshots
    request.with_archiveresults = with_archiveresults
    try:
        crawl = Crawl.objects.get(abid__icontains=crawl_id)
    except Exception:
        pass
    try:
        # fall back to matching by id only when the abid lookup found nothing
        crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
    except Exception:
        pass
    return crawl
@router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)") @router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)")
def get_any(request, abid: str): def get_any(request, abid: str):
"""Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.).""" """Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
@ -529,11 +428,13 @@ def get_any(request, abid: str):
pass pass
try: try:
from api.v1_crawls import get_seed
response = response or get_seed(request, abid) response = response or get_seed(request, abid)
except Exception: except Exception:
pass pass
try: try:
from api.v1_crawls import get_crawl
response = response or get_crawl(request, abid) response = response or get_crawl(request, abid)
except Exception: except Exception:
pass pass

119
archivebox/api/v1_crawls.py Normal file
View file

@ -0,0 +1,119 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List
from datetime import datetime
from django.db.models import Q
from django.contrib.auth import get_user_model
from ninja import Router, Schema
from core.models import Snapshot
from crawls.models import Crawl
from seeds.models import Seed
from .auth import API_AUTH_METHODS
router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
class SeedSchema(Schema):
    """API representation of a seeds.models.Seed (a source of URLs to be crawled)."""
    TYPE: str = 'seeds.models.Seed'

    id: UUID
    abid: str

    modified_at: datetime
    created_at: datetime
    created_by_id: str
    created_by_username: str

    uri: str
    tags_str: str
    config: dict

    @staticmethod
    def resolve_created_by_id(obj):
        # FK value is exposed as a string over the API regardless of its DB type
        return str(obj.created_by_id)

    @staticmethod
    def resolve_created_by_username(obj):
        # NOTE(review): per-object User lookup — N+1 queries when serializing a list of seeds
        User = get_user_model()
        return User.objects.get(id=obj.created_by_id).username
@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
def get_seeds(request):
    """List every Seed known to this instance."""
    all_seeds = Seed.objects.all()
    return all_seeds.distinct()
@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
def get_seed(request, seed_id: str):
    """Look up a single Seed by (partial) id or abid; None when no unique match."""
    # flags read by nested schema resolvers via the request object
    request.with_snapshots = False
    request.with_archiveresults = False

    matches_id_or_abid = Q(abid__icontains=seed_id) | Q(id__icontains=seed_id)
    try:
        return Seed.objects.get(matches_id_or_abid)
    except Exception:
        # not found (or ambiguous partial match) -> None, same as original behavior
        return None
class CrawlSchema(Schema):
    """API representation of a Crawl (a Seed being crawled to a given max_depth)."""
    # NOTE(review): TYPE says 'core.models.Crawl' but Crawl is imported above from
    # crawls.models — confirm whether this should read 'crawls.models.Crawl'
    # (API consumers may match on this string, so not changed here).
    TYPE: str = 'core.models.Crawl'

    id: UUID
    abid: str

    modified_at: datetime
    created_at: datetime
    created_by_id: str
    created_by_username: str

    status: str
    retry_at: datetime | None

    seed: SeedSchema
    max_depth: int

    # snapshots: List[SnapshotSchema]

    @staticmethod
    def resolve_created_by_id(obj):
        # FK value is exposed as a string over the API regardless of its DB type
        return str(obj.created_by_id)

    @staticmethod
    def resolve_created_by_username(obj):
        # NOTE(review): per-object User lookup — N+1 queries when serializing lists
        User = get_user_model()
        return User.objects.get(id=obj.created_by_id).username

    @staticmethod
    def resolve_snapshots(obj, context):
        # only include snapshots when the endpoint opted in via request.with_snapshots
        if context['request'].with_snapshots:
            return obj.snapshot_set.all().distinct()
        return Snapshot.objects.none()
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
    """List every Crawl known to this instance."""
    all_crawls = Crawl.objects.all()
    return all_crawls.distinct()
@router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
    """Get a specific Crawl by id or abid."""
    # flags read by nested schema resolvers via the request object
    request.with_snapshots = with_snapshots
    request.with_archiveresults = with_archiveresults

    # try abid first, then id — first unique match wins (same precedence as before)
    for lookup in (Q(abid__icontains=crawl_id), Q(id__icontains=crawl_id)):
        try:
            return Crawl.objects.get(lookup)
        except Exception:
            continue
    return None

View file

@ -40,6 +40,7 @@ def add(urls: str | list[str],
extractors: str="", extractors: str="",
parser: str="auto", parser: str="auto",
persona: str='Default', persona: str='Default',
bg: bool=False,
created_by_id: int | None=None) -> QuerySet['Snapshot']: created_by_id: int | None=None) -> QuerySet['Snapshot']:
"""Add a new URL or list of URLs to your archive""" """Add a new URL or list of URLs to your archive"""
@ -51,7 +52,6 @@ def add(urls: str | list[str],
setup_django() setup_django()
check_data_folder() check_data_folder()
from seeds.models import Seed from seeds.models import Seed
from crawls.models import Crawl from crawls.models import Crawl
from actors.orchestrator import Orchestrator from actors.orchestrator import Orchestrator
@ -83,8 +83,9 @@ def add(urls: str | list[str],
# from crawls.actors import CrawlActor # from crawls.actors import CrawlActor
# from core.actors import SnapshotActor, ArchiveResultActor # from core.actors import SnapshotActor, ArchiveResultActor
orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=2) if not bg:
orchestrator.start() orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
orchestrator.start()
# 5. return the list of new Snapshots created # 5. return the list of new Snapshots created
return crawl.snapshot_set.all() return crawl.snapshot_set.all()
@ -169,6 +170,12 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=Non
help="Name of accounts persona to use when archiving.", help="Name of accounts persona to use when archiving.",
default="Default", default="Default",
) )
parser.add_argument(
"--bg",
default=False,
action="store_true",
help="Enqueue a background worker to complete the crawl instead of running it immediately",
)
command = parser.parse_args(args or ()) command = parser.parse_args(args or ())
urls = command.urls urls = command.urls
@ -193,6 +200,7 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=Non
extractors=command.extract, extractors=command.extract,
parser=command.parser, parser=command.parser,
persona=command.persona, persona=command.persona,
bg=command.bg,
) )

View file

@ -23,30 +23,32 @@ urlpatterns = [
re_path(r"^static/(?P<path>.*)$", serve_static), re_path(r"^static/(?P<path>.*)$", serve_static),
# re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}), # re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda *_: 1/0), # type: ignore
path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}), path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}), path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'), path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
path('archive/', RedirectView.as_view(url='/')),
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', archivebox_admin.urls),
path("api/", include('api.urls'), name='api'),
path('public/', PublicIndexView.as_view(), name='public-index'), path('public/', PublicIndexView.as_view(), name='public-index'),
path('archive/', RedirectView.as_view(url='/')),
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'), path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('add/', AddView.as_view(), name='add'), path('add/', AddView.as_view(), name='add'),
path("jobs/", JobsDashboardView.as_view(), name='jobs_dashboard'), path("jobs/", JobsDashboardView.as_view(), name='jobs_dashboard'),
path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', archivebox_admin.urls),
path("api/", include('api.urls'), name='api'),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda *_: 1/0), # type: ignore
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django # path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django

View file

@ -185,7 +185,7 @@ class BaseConfigSet(BaseSettings):
return computed_default return computed_default
return value return value
def update_in_place(self, warn=True, persist=False, hint='', **kwargs): def update_in_place(self, warn=False, persist=False, hint='', **kwargs):
""" """
Update the config with new values. Use this sparingly! We should almost never be updating config at runtime. Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment
@ -201,7 +201,7 @@ class BaseConfigSet(BaseSettings):
if all(key in _ALREADY_WARNED_ABOUT_UPDATED_CONFIG for key in kwargs.keys()): if all(key in _ALREADY_WARNED_ABOUT_UPDATED_CONFIG for key in kwargs.keys()):
warn = False warn = False
if warn: if warn or os.environ.get('DEBUG', '').lower() in ('true', '1', 'yes', 'on'):
fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run' fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
print(f'\n[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr) print(f'\n[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)