Mirror of https://github.com/ArchiveBox/ArchiveBox (synced 2024-11-22 04:03:06 +00:00)

Commit eeb2671e4d: API improvements
Parent commit: c7bd9449d5
7 changed files with 157 additions and 127 deletions
@@ -37,9 +37,9 @@ html_description=f'''
 def register_urls(api: NinjaAPI) -> NinjaAPI:
-    api.add_router('/auth/', 'api.v1_auth.router')
+    # api.add_router('/auth/', 'api.v1_auth.router')
     api.add_router('/core/', 'api.v1_core.router')
-    api.add_router('/crawls/', 'api.v1_core.router')
+    api.add_router('/crawls/', 'api.v1_crawls.router')
     api.add_router('/cli/', 'api.v1_cli.router')
     api.add_router('/jobs/', 'api.v1_actors.router')
     return api
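The '/crawls/' prefix previously pointed back at the core router and now mounts the new api.v1_crawls router, while the '/auth/' router is disabled. django-ninja's add_router() accepts either a Router object or a dotted-path string to one, which is the pattern used above. A minimal sketch of that pattern, with a hypothetical module name (my_api.v1_things) and assuming django-ninja is installed and Django settings are configured:

    from ninja import NinjaAPI, Router

    router = Router()                       # would normally live in my_api/v1_things.py

    @router.get("/ping")
    def ping(request):
        return {"ok": True}

    api = NinjaAPI()
    api.add_router('/things/', router)      # pass the Router object directly...
    # api.add_router('/things/', 'my_api.v1_things.router')   # ...or reference it by dotted path, as above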
@@ -83,7 +83,7 @@ class NinjaAPIWithIOCapture(NinjaAPI):
 api = NinjaAPIWithIOCapture(
     title='ArchiveBox API',
     description=html_description,
-    version='1.0.0',
+    version=VERSION,
     csrf=False,
     auth=API_AUTH_METHODS,
     urls_namespace="api-1",
@@ -17,10 +17,10 @@ from archivebox.misc.util import ansi_to_html
 from archivebox.config.common import ARCHIVING_CONFIG
 
-from .auth import API_AUTH_METHODS
+# from .auth import API_AUTH_METHODS
 
 # router for API that exposes archivebox cli subcommands as REST endpoints
-router = Router(tags=['ArchiveBox CLI Sub-Commands'], auth=API_AUTH_METHODS)
+router = Router(tags=['ArchiveBox CLI Sub-Commands'])
 
 
 # Schemas
@@ -16,12 +16,13 @@ from ninja.errors import HttpError
 from core.models import Snapshot, ArchiveResult, Tag
 from api.models import APIToken, OutboundWebhook
-from crawls.models import Crawl
-from seeds.models import Seed
+from api.v1_crawls import CrawlSchema, SeedSchema
 
-from .auth import API_AUTH_METHODS
+# from .auth import API_AUTH_METHODS
 
-router = Router(tags=['Core Models'], auth=API_AUTH_METHODS)
+router = Router(tags=['Core Models'])
 
 
@@ -397,108 +398,6 @@ def get_tag(request, tag_id: str, with_snapshots: bool=True):
         pass
     return tag
 
-
-class SeedSchema(Schema):
-    TYPE: str = 'seeds.models.Seed'
-
-    id: UUID
-    abid: str
-
-    modified_at: datetime
-    created_at: datetime
-    created_by_id: str
-    created_by_username: str
-
-    uri: str
-    tags_str: str
-    config: dict
-
-    @staticmethod
-    def resolve_created_by_id(obj):
-        return str(obj.created_by_id)
-
-    @staticmethod
-    def resolve_created_by_username(obj):
-        User = get_user_model()
-        return User.objects.get(id=obj.created_by_id).username
-
-@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
-def get_seeds(request):
-    return Seed.objects.all().distinct()
-
-@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
-def get_seed(request, seed_id: str):
-    seed = None
-    request.with_snapshots = False
-    request.with_archiveresults = False
-
-    try:
-        seed = Seed.objects.get(Q(abid__icontains=seed_id) | Q(id__icontains=seed_id))
-    except Exception:
-        pass
-    return seed
-
-
-class CrawlSchema(Schema):
-    TYPE: str = 'core.models.Crawl'
-
-    id: UUID
-    abid: str
-
-    modified_at: datetime
-    created_at: datetime
-    created_by_id: str
-    created_by_username: str
-
-    status: str
-    retry_at: datetime | None
-
-    seed: SeedSchema
-    max_depth: int
-
-    # snapshots: List[SnapshotSchema]
-
-    @staticmethod
-    def resolve_created_by_id(obj):
-        return str(obj.created_by_id)
-
-    @staticmethod
-    def resolve_created_by_username(obj):
-        User = get_user_model()
-        return User.objects.get(id=obj.created_by_id).username
-
-    @staticmethod
-    def resolve_snapshots(obj, context):
-        if context['request'].with_snapshots:
-            return obj.snapshot_set.all().distinct()
-        return Snapshot.objects.none()
-
-
-@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
-def get_crawls(request):
-    return Crawl.objects.all().distinct()
-
-@router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
-def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
-    """Get a specific Crawl by id or abid."""
-
-    crawl = None
-    request.with_snapshots = with_snapshots
-    request.with_archiveresults = with_archiveresults
-
-    try:
-        crawl = Crawl.objects.get(abid__icontains=crawl_id)
-    except Exception:
-        pass
-
-    try:
-        crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
-    except Exception:
-        pass
-    return crawl
-
-
 @router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)")
 def get_any(request, abid: str):
     """Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
@@ -529,11 +428,13 @@ def get_any(request, abid: str):
         pass
 
     try:
+        from api.v1_crawls import get_seed
         response = response or get_seed(request, abid)
     except Exception:
         pass
 
    try:
+        from api.v1_crawls import get_crawl
         response = response or get_crawl(request, abid)
     except Exception:
         pass
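get_seed() and get_crawl() now live in api.v1_crawls, and get_any() imports them inside its try blocks rather than at the top of the module, likely to keep the dependency local to the one endpoint that needs it and to avoid import-order issues at module load. A minimal sketch of the deferred-import pattern (resolve_abid is a hypothetical name; it assumes an ArchiveBox Django environment is already configured):

    def resolve_abid(request, abid: str):
        from api.v1_crawls import get_seed, get_crawl   # resolved at call time, not at module load
        return get_seed(request, abid) or get_crawl(request, abid)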
archivebox/api/v1_crawls.py (new file, 119 lines)

@@ -0,0 +1,119 @@
+__package__ = 'archivebox.api'
+
+from uuid import UUID
+from typing import List
+from datetime import datetime
+
+from django.db.models import Q
+from django.contrib.auth import get_user_model
+
+from ninja import Router, Schema
+
+from core.models import Snapshot
+from crawls.models import Crawl
+from seeds.models import Seed
+
+from .auth import API_AUTH_METHODS
+
+router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
+
+
+class SeedSchema(Schema):
+    TYPE: str = 'seeds.models.Seed'
+
+    id: UUID
+    abid: str
+
+    modified_at: datetime
+    created_at: datetime
+    created_by_id: str
+    created_by_username: str
+
+    uri: str
+    tags_str: str
+    config: dict
+
+    @staticmethod
+    def resolve_created_by_id(obj):
+        return str(obj.created_by_id)
+
+    @staticmethod
+    def resolve_created_by_username(obj):
+        User = get_user_model()
+        return User.objects.get(id=obj.created_by_id).username
+
+
+@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
+def get_seeds(request):
+    return Seed.objects.all().distinct()
+
+
+@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
+def get_seed(request, seed_id: str):
+    seed = None
+    request.with_snapshots = False
+    request.with_archiveresults = False
+
+    try:
+        seed = Seed.objects.get(Q(abid__icontains=seed_id) | Q(id__icontains=seed_id))
+    except Exception:
+        pass
+    return seed
+
+
+class CrawlSchema(Schema):
+    TYPE: str = 'core.models.Crawl'
+
+    id: UUID
+    abid: str
+
+    modified_at: datetime
+    created_at: datetime
+    created_by_id: str
+    created_by_username: str
+
+    status: str
+    retry_at: datetime | None
+
+    seed: SeedSchema
+    max_depth: int
+
+    # snapshots: List[SnapshotSchema]
+
+    @staticmethod
+    def resolve_created_by_id(obj):
+        return str(obj.created_by_id)
+
+    @staticmethod
+    def resolve_created_by_username(obj):
+        User = get_user_model()
+        return User.objects.get(id=obj.created_by_id).username
+
+    @staticmethod
+    def resolve_snapshots(obj, context):
+        if context['request'].with_snapshots:
+            return obj.snapshot_set.all().distinct()
+        return Snapshot.objects.none()
+
+
+@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
+def get_crawls(request):
+    return Crawl.objects.all().distinct()
+
+
+@router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
+def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
+    """Get a specific Crawl by id or abid."""
+
+    crawl = None
+    request.with_snapshots = with_snapshots
+    request.with_archiveresults = with_archiveresults
+
+    try:
+        crawl = Crawl.objects.get(abid__icontains=crawl_id)
+    except Exception:
+        pass
+
+    try:
+        crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
+    except Exception:
+        pass
+    return crawl
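With this router mounted under the '/crawls/' prefix in register_urls, the seed and crawl endpoints become reachable over HTTP. A hypothetical client-side sketch, assuming the API is served at /api/v1/ on a local instance and that the session is already authenticated via one of the configured API_AUTH_METHODS (token, session cookie, etc.):

    import requests

    BASE = "http://127.0.0.1:8000/api/v1"      # assumed mount point and port
    session = requests.Session()
    # session.headers.update({...})            # attach credentials per your auth setup

    seeds = session.get(f"{BASE}/crawls/seeds").json()       # -> List[SeedSchema]
    crawls = session.get(f"{BASE}/crawls/crawls").json()     # -> List[CrawlSchema]
    if crawls:
        crawl = session.get(f"{BASE}/crawls/crawl/{crawls[0]['abid']}").json()
        print(crawl['seed']['uri'], crawl['status'], crawl['max_depth'])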
@@ -40,6 +40,7 @@ def add(urls: str | list[str],
         extractors: str="",
         parser: str="auto",
         persona: str='Default',
+        bg: bool=False,
         created_by_id: int | None=None) -> QuerySet['Snapshot']:
     """Add a new URL or list of URLs to your archive"""
@@ -51,7 +52,6 @@ def add(urls: str | list[str],
     setup_django()
     check_data_folder()
 
-
     from seeds.models import Seed
     from crawls.models import Crawl
     from actors.orchestrator import Orchestrator
@@ -83,8 +83,9 @@ def add(urls: str | list[str],
     # from crawls.actors import CrawlActor
     # from core.actors import SnapshotActor, ArchiveResultActor
 
-    orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=2)
-    orchestrator.start()
+    if not bg:
+        orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
+        orchestrator.start()
 
     # 5. return the list of new Snapshots created
     return crawl.snapshot_set.all()
@@ -169,6 +170,12 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=None
         help="Name of accounts persona to use when archiving.",
         default="Default",
     )
+    parser.add_argument(
+        "--bg",
+        default=False,
+        action="store_true",
+        help="Enqueue a background worker to complete the crawl instead of running it immediately",
+    )
     command = parser.parse_args(args or ())
     urls = command.urls
@@ -193,6 +200,7 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=None
         extractors=command.extract,
         parser=command.parser,
         persona=command.persona,
+        bg=command.bg,
     )
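Taken together, these hunks thread a new bg option through both the Python add() function and the CLI (the new --bg flag): when bg is set, the in-process Orchestrator is skipped so the queued crawl can be picked up by a separately running worker. A hypothetical usage sketch; the import path is an assumption (use whichever module defines the patched add() in your checkout), and it requires an initialized ArchiveBox data directory:

    from archivebox.cli.archivebox_add import add   # assumed location of the patched add()

    # Foreground (default): starts an Orchestrator and blocks until the crawl finishes.
    snapshots = add("https://example.com")

    # Background: skips the in-process Orchestrator and leaves the crawl queued
    # for a separately running worker (equivalent to `archivebox add --bg ...`).
    queued = add("https://example.com", bg=True)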
@@ -23,30 +23,32 @@ urlpatterns = [
     re_path(r"^static/(?P<path>.*)$", serve_static),
     # re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}),
 
-    path('health/', HealthCheckView.as_view(), name='healthcheck'),
-    path('error/', lambda *_: 1/0), # type: ignore
     path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
     path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
 
-    path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
-    path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
-
-    path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
     path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
-    path('archive/', RedirectView.as_view(url='/')),
-
-    path('accounts/', include('django.contrib.auth.urls')),
-    path('admin/', archivebox_admin.urls),
-    path("api/", include('api.urls'), name='api'),
 
     path('public/', PublicIndexView.as_view(), name='public-index'),
 
+    path('archive/', RedirectView.as_view(url='/')),
     path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
 
+    path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
     path('add/', AddView.as_view(), name='add'),
 
     path("jobs/", JobsDashboardView.as_view(), name='jobs_dashboard'),
 
+    path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
+    path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
+
+    path('accounts/', include('django.contrib.auth.urls')),
+    path('admin/', archivebox_admin.urls),
+
+    path("api/", include('api.urls'), name='api'),
+
+    path('health/', HealthCheckView.as_view(), name='healthcheck'),
+    path('error/', lambda *_: 1/0), # type: ignore
 
     # path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
@@ -185,7 +185,7 @@ class BaseConfigSet(BaseSettings):
             return computed_default
         return value
 
-    def update_in_place(self, warn=True, persist=False, hint='', **kwargs):
+    def update_in_place(self, warn=False, persist=False, hint='', **kwargs):
         """
         Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
         Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment
@@ -201,7 +201,7 @@ class BaseConfigSet(BaseSettings):
         if all(key in _ALREADY_WARNED_ABOUT_UPDATED_CONFIG for key in kwargs.keys()):
             warn = False
 
-        if warn:
+        if warn or os.environ.get('DEBUG', '').lower() in ('true', '1', 'yes', 'on'):
             fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
             print(f'\n[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)