API improvements

This commit is contained in:
Nick Sweeting 2024-11-18 04:27:19 -08:00
parent c7bd9449d5
commit eeb2671e4d
No known key found for this signature in database
7 changed files with 157 additions and 127 deletions

View file

@ -37,9 +37,9 @@ html_description=f'''
def register_urls(api: NinjaAPI) -> NinjaAPI: def register_urls(api: NinjaAPI) -> NinjaAPI:
api.add_router('/auth/', 'api.v1_auth.router') # api.add_router('/auth/', 'api.v1_auth.router')
api.add_router('/core/', 'api.v1_core.router') api.add_router('/core/', 'api.v1_core.router')
api.add_router('/crawls/', 'api.v1_core.router') api.add_router('/crawls/', 'api.v1_crawls.router')
api.add_router('/cli/', 'api.v1_cli.router') api.add_router('/cli/', 'api.v1_cli.router')
api.add_router('/jobs/', 'api.v1_actors.router') api.add_router('/jobs/', 'api.v1_actors.router')
return api return api
@ -83,7 +83,7 @@ class NinjaAPIWithIOCapture(NinjaAPI):
api = NinjaAPIWithIOCapture( api = NinjaAPIWithIOCapture(
title='ArchiveBox API', title='ArchiveBox API',
description=html_description, description=html_description,
version='1.0.0', version=VERSION,
csrf=False, csrf=False,
auth=API_AUTH_METHODS, auth=API_AUTH_METHODS,
urls_namespace="api-1", urls_namespace="api-1",

View file

@ -17,10 +17,10 @@ from archivebox.misc.util import ansi_to_html
from archivebox.config.common import ARCHIVING_CONFIG from archivebox.config.common import ARCHIVING_CONFIG
from .auth import API_AUTH_METHODS # from .auth import API_AUTH_METHODS
# router for API that exposes archivebox cli subcommands as REST endpoints # router for API that exposes archivebox cli subcommands as REST endpoints
router = Router(tags=['ArchiveBox CLI Sub-Commands'], auth=API_AUTH_METHODS) router = Router(tags=['ArchiveBox CLI Sub-Commands'])
# Schemas # Schemas

View file

@ -16,12 +16,13 @@ from ninja.errors import HttpError
from core.models import Snapshot, ArchiveResult, Tag from core.models import Snapshot, ArchiveResult, Tag
from api.models import APIToken, OutboundWebhook from api.models import APIToken, OutboundWebhook
from crawls.models import Crawl from api.v1_crawls import CrawlSchema, SeedSchema
from seeds.models import Seed
from .auth import API_AUTH_METHODS # from .auth import API_AUTH_METHODS
router = Router(tags=['Core Models'], auth=API_AUTH_METHODS)
router = Router(tags=['Core Models'])
@ -397,108 +398,6 @@ def get_tag(request, tag_id: str, with_snapshots: bool=True):
pass pass
return tag return tag
class SeedSchema(Schema):
    """API representation of a seeds.models.Seed (a source of URLs to be crawled)."""
    TYPE: str = 'seeds.models.Seed'

    id: UUID
    abid: str

    modified_at: datetime
    created_at: datetime
    created_by_id: str
    created_by_username: str

    uri: str
    tags_str: str
    config: dict

    @staticmethod
    def resolve_created_by_id(obj):
        # FK value is exposed as a string over the API regardless of its DB type
        return str(obj.created_by_id)

    @staticmethod
    def resolve_created_by_username(obj):
        # NOTE(review): per-object User lookup — N+1 queries when serializing a list of seeds
        User = get_user_model()
        return User.objects.get(id=obj.created_by_id).username
@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
def get_seeds(request):
    """List all Seeds."""
    return Seed.objects.all().distinct()
@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
def get_seed(request, seed_id: str):
    """Get a specific Seed by (partial) id or abid; returns None if not found."""
    seed = None
    # flags read by nested schema resolvers via the request object
    request.with_snapshots = False
    request.with_archiveresults = False
    try:
        seed = Seed.objects.get(Q(abid__icontains=seed_id) | Q(id__icontains=seed_id))
    except Exception:
        # not found (or ambiguous partial match) -> fall through and return None
        pass
    return seed
class CrawlSchema(Schema):
    """API representation of a Crawl (a Seed being crawled to a given max_depth)."""
    # NOTE(review): TYPE says 'core.models.Crawl' but the model is imported from
    # crawls.models — confirm whether this should read 'crawls.models.Crawl'
    # (API consumers may match on this string, so not changed here).
    TYPE: str = 'core.models.Crawl'

    id: UUID
    abid: str

    modified_at: datetime
    created_at: datetime
    created_by_id: str
    created_by_username: str

    status: str
    retry_at: datetime | None

    seed: SeedSchema
    max_depth: int

    # snapshots: List[SnapshotSchema]

    @staticmethod
    def resolve_created_by_id(obj):
        # FK value is exposed as a string over the API regardless of its DB type
        return str(obj.created_by_id)

    @staticmethod
    def resolve_created_by_username(obj):
        # NOTE(review): per-object User lookup — N+1 queries when serializing lists
        User = get_user_model()
        return User.objects.get(id=obj.created_by_id).username

    @staticmethod
    def resolve_snapshots(obj, context):
        # only include snapshots when the endpoint opted in via request.with_snapshots
        if context['request'].with_snapshots:
            return obj.snapshot_set.all().distinct()
        return Snapshot.objects.none()
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
    """List all Crawls."""
    return Crawl.objects.all().distinct()
@router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
    """Get a specific Crawl by id or abid."""
    crawl = None
    # flags read by nested schema resolvers via the request object
    request.with_snapshots = with_snapshots
    request.with_archiveresults = with_archiveresults
    try:
        crawl = Crawl.objects.get(abid__icontains=crawl_id)
    except Exception:
        pass
    try:
        # fall back to matching by id only when the abid lookup found nothing
        crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
    except Exception:
        pass
    return crawl
@router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)") @router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)")
def get_any(request, abid: str): def get_any(request, abid: str):
"""Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.).""" """Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
@ -529,11 +428,13 @@ def get_any(request, abid: str):
pass pass
try: try:
from api.v1_crawls import get_seed
response = response or get_seed(request, abid) response = response or get_seed(request, abid)
except Exception: except Exception:
pass pass
try: try:
from api.v1_crawls import get_crawl
response = response or get_crawl(request, abid) response = response or get_crawl(request, abid)
except Exception: except Exception:
pass pass

119
archivebox/api/v1_crawls.py Normal file
View file

@ -0,0 +1,119 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List
from datetime import datetime
from django.db.models import Q
from django.contrib.auth import get_user_model
from ninja import Router, Schema
from core.models import Snapshot
from crawls.models import Crawl
from seeds.models import Seed
from .auth import API_AUTH_METHODS
router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
class SeedSchema(Schema):
    """API representation of a seeds.models.Seed (a source of URLs to be crawled)."""
    TYPE: str = 'seeds.models.Seed'

    id: UUID
    abid: str

    modified_at: datetime
    created_at: datetime
    created_by_id: str
    created_by_username: str

    uri: str
    tags_str: str
    config: dict

    @staticmethod
    def resolve_created_by_id(obj):
        # FK value is exposed as a string over the API regardless of its DB type
        return str(obj.created_by_id)

    @staticmethod
    def resolve_created_by_username(obj):
        # NOTE(review): per-object User lookup — N+1 queries when serializing a list of seeds
        User = get_user_model()
        return User.objects.get(id=obj.created_by_id).username
@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
def get_seeds(request):
    """List every Seed known to this instance."""
    all_seeds = Seed.objects.all()
    return all_seeds.distinct()
@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
def get_seed(request, seed_id: str):
    """Look up a single Seed by (partial) id or abid; None when no unique match."""
    # flags read by nested schema resolvers via the request object
    request.with_snapshots = False
    request.with_archiveresults = False

    matches_id_or_abid = Q(abid__icontains=seed_id) | Q(id__icontains=seed_id)
    try:
        return Seed.objects.get(matches_id_or_abid)
    except Exception:
        # not found (or ambiguous partial match) -> None, same as original behavior
        return None
class CrawlSchema(Schema):
    """API representation of a Crawl (a Seed being crawled to a given max_depth)."""
    # NOTE(review): TYPE says 'core.models.Crawl' but Crawl is imported above from
    # crawls.models — confirm whether this should read 'crawls.models.Crawl'
    # (API consumers may match on this string, so not changed here).
    TYPE: str = 'core.models.Crawl'

    id: UUID
    abid: str

    modified_at: datetime
    created_at: datetime
    created_by_id: str
    created_by_username: str

    status: str
    retry_at: datetime | None

    seed: SeedSchema
    max_depth: int

    # snapshots: List[SnapshotSchema]

    @staticmethod
    def resolve_created_by_id(obj):
        # FK value is exposed as a string over the API regardless of its DB type
        return str(obj.created_by_id)

    @staticmethod
    def resolve_created_by_username(obj):
        # NOTE(review): per-object User lookup — N+1 queries when serializing lists
        User = get_user_model()
        return User.objects.get(id=obj.created_by_id).username

    @staticmethod
    def resolve_snapshots(obj, context):
        # only include snapshots when the endpoint opted in via request.with_snapshots
        if context['request'].with_snapshots:
            return obj.snapshot_set.all().distinct()
        return Snapshot.objects.none()
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
    """List every Crawl known to this instance."""
    all_crawls = Crawl.objects.all()
    return all_crawls.distinct()
@router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
    """Get a specific Crawl by id or abid."""
    # flags read by nested schema resolvers via the request object
    request.with_snapshots = with_snapshots
    request.with_archiveresults = with_archiveresults

    # try abid first, then id — first unique match wins (same precedence as before)
    for lookup in (Q(abid__icontains=crawl_id), Q(id__icontains=crawl_id)):
        try:
            return Crawl.objects.get(lookup)
        except Exception:
            continue
    return None

View file

@ -40,6 +40,7 @@ def add(urls: str | list[str],
extractors: str="", extractors: str="",
parser: str="auto", parser: str="auto",
persona: str='Default', persona: str='Default',
bg: bool=False,
created_by_id: int | None=None) -> QuerySet['Snapshot']: created_by_id: int | None=None) -> QuerySet['Snapshot']:
"""Add a new URL or list of URLs to your archive""" """Add a new URL or list of URLs to your archive"""
@ -51,7 +52,6 @@ def add(urls: str | list[str],
setup_django() setup_django()
check_data_folder() check_data_folder()
from seeds.models import Seed from seeds.models import Seed
from crawls.models import Crawl from crawls.models import Crawl
from actors.orchestrator import Orchestrator from actors.orchestrator import Orchestrator
@ -83,8 +83,9 @@ def add(urls: str | list[str],
# from crawls.actors import CrawlActor # from crawls.actors import CrawlActor
# from core.actors import SnapshotActor, ArchiveResultActor # from core.actors import SnapshotActor, ArchiveResultActor
orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=2) if not bg:
orchestrator.start() orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
orchestrator.start()
# 5. return the list of new Snapshots created # 5. return the list of new Snapshots created
return crawl.snapshot_set.all() return crawl.snapshot_set.all()
@ -169,6 +170,12 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=Non
help="Name of accounts persona to use when archiving.", help="Name of accounts persona to use when archiving.",
default="Default", default="Default",
) )
parser.add_argument(
"--bg",
default=False,
action="store_true",
help="Enqueue a background worker to complete the crawl instead of running it immediately",
)
command = parser.parse_args(args or ()) command = parser.parse_args(args or ())
urls = command.urls urls = command.urls
@ -193,6 +200,7 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=Non
extractors=command.extract, extractors=command.extract,
parser=command.parser, parser=command.parser,
persona=command.persona, persona=command.persona,
bg=command.bg,
) )

View file

@ -23,30 +23,32 @@ urlpatterns = [
re_path(r"^static/(?P<path>.*)$", serve_static), re_path(r"^static/(?P<path>.*)$", serve_static),
# re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}), # re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda *_: 1/0), # type: ignore
path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}), path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}), path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'), path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
path('archive/', RedirectView.as_view(url='/')),
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', archivebox_admin.urls),
path("api/", include('api.urls'), name='api'),
path('public/', PublicIndexView.as_view(), name='public-index'), path('public/', PublicIndexView.as_view(), name='public-index'),
path('archive/', RedirectView.as_view(url='/')),
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'), path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('add/', AddView.as_view(), name='add'), path('add/', AddView.as_view(), name='add'),
path("jobs/", JobsDashboardView.as_view(), name='jobs_dashboard'), path("jobs/", JobsDashboardView.as_view(), name='jobs_dashboard'),
path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', archivebox_admin.urls),
path("api/", include('api.urls'), name='api'),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda *_: 1/0), # type: ignore
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django # path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django

View file

@ -185,7 +185,7 @@ class BaseConfigSet(BaseSettings):
return computed_default return computed_default
return value return value
def update_in_place(self, warn=True, persist=False, hint='', **kwargs): def update_in_place(self, warn=False, persist=False, hint='', **kwargs):
""" """
Update the config with new values. Use this sparingly! We should almost never be updating config at runtime. Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment
@ -201,7 +201,7 @@ class BaseConfigSet(BaseSettings):
if all(key in _ALREADY_WARNED_ABOUT_UPDATED_CONFIG for key in kwargs.keys()): if all(key in _ALREADY_WARNED_ABOUT_UPDATED_CONFIG for key in kwargs.keys()):
warn = False warn = False
if warn: if warn or os.environ.get('DEBUG', '').lower() in ('true', '1', 'yes', 'on'):
fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run' fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
print(f'\n[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr) print(f'\n[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)