API improvements

Nick Sweeting 2024-11-18 04:27:19 -08:00
parent c7bd9449d5
commit eeb2671e4d
7 changed files with 157 additions and 127 deletions


@@ -37,9 +37,9 @@ html_description=f'''
def register_urls(api: NinjaAPI) -> NinjaAPI:
api.add_router('/auth/', 'api.v1_auth.router')
# api.add_router('/auth/', 'api.v1_auth.router')
api.add_router('/core/', 'api.v1_core.router')
api.add_router('/crawls/', 'api.v1_core.router')
api.add_router('/crawls/', 'api.v1_crawls.router')
api.add_router('/cli/', 'api.v1_cli.router')
api.add_router('/jobs/', 'api.v1_actors.router')
return api
@@ -83,7 +83,7 @@ class NinjaAPIWithIOCapture(NinjaAPI):
api = NinjaAPIWithIOCapture(
title='ArchiveBox API',
description=html_description,
version='1.0.0',
version=VERSION,
csrf=False,
auth=API_AUTH_METHODS,
urls_namespace="api-1",
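
Taken together, the two hunks above move auth enforcement up to the NinjaAPI instance and pin the reported API version to the package VERSION. One way to sanity-check the resulting route table (a sketch: the import path for this module is assumed, and it must run inside a configured Django context such as the Django shell):

# Sketch: list the mounted API paths after register_urls() has run.
# NinjaAPI.urls returns an include()-style tuple: (urlpatterns, app_name, namespace).
from api.v1_api import api          # hypothetical module path for this file
for pattern in api.urls[0]:
    print(pattern.pattern)          # expect crawls/..., core/..., cli/..., jobs/...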


@@ -17,10 +17,10 @@ from archivebox.misc.util import ansi_to_html
from archivebox.config.common import ARCHIVING_CONFIG
from .auth import API_AUTH_METHODS
# from .auth import API_AUTH_METHODS
# router for API that exposes archivebox cli subcommands as REST endpoints
router = Router(tags=['ArchiveBox CLI Sub-Commands'], auth=API_AUTH_METHODS)
router = Router(tags=['ArchiveBox CLI Sub-Commands'])
# Schemas
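
Dropping auth=API_AUTH_METHODS here is consistent with the first hunk: authentication is now declared once on the NinjaAPI instance and inherited by every router it mounts, instead of being repeated per router. In django-ninja terms (a minimal sketch, not ArchiveBox code):

# Sketch of API-level vs. router-level auth in django-ninja.
from ninja import NinjaAPI, Router

def require_key(request):                  # stand-in for API_AUTH_METHODS
    return request.headers.get('X-API-Key') == 'secret' or None

api = NinjaAPI(auth=[require_key])         # applies to every mounted router
router = Router(tags=['Example'])          # no per-router auth needed
api.add_router('/example/', router)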


@@ -16,12 +16,13 @@ from ninja.errors import HttpError
from core.models import Snapshot, ArchiveResult, Tag
from api.models import APIToken, OutboundWebhook
from crawls.models import Crawl
from seeds.models import Seed
from api.v1_crawls import CrawlSchema, SeedSchema
from .auth import API_AUTH_METHODS
# from .auth import API_AUTH_METHODS
router = Router(tags=['Core Models'], auth=API_AUTH_METHODS)
router = Router(tags=['Core Models'])
@@ -397,108 +398,6 @@ def get_tag(request, tag_id: str, with_snapshots: bool=True):
pass
return tag
class SeedSchema(Schema):
TYPE: str = 'seeds.models.Seed'
id: UUID
abid: str
modified_at: datetime
created_at: datetime
created_by_id: str
created_by_username: str
uri: str
tags_str: str
config: dict
@staticmethod
def resolve_created_by_id(obj):
return str(obj.created_by_id)
@staticmethod
def resolve_created_by_username(obj):
User = get_user_model()
return User.objects.get(id=obj.created_by_id).username
@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
def get_seeds(request):
return Seed.objects.all().distinct()
@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
def get_seed(request, seed_id: str):
seed = None
request.with_snapshots = False
request.with_archiveresults = False
try:
seed = Seed.objects.get(Q(abid__icontains=seed_id) | Q(id__icontains=seed_id))
except Exception:
pass
return seed
class CrawlSchema(Schema):
TYPE: str = 'core.models.Crawl'
id: UUID
abid: str
modified_at: datetime
created_at: datetime
created_by_id: str
created_by_username: str
status: str
retry_at: datetime | None
seed: SeedSchema
max_depth: int
# snapshots: List[SnapshotSchema]
@staticmethod
def resolve_created_by_id(obj):
return str(obj.created_by_id)
@staticmethod
def resolve_created_by_username(obj):
User = get_user_model()
return User.objects.get(id=obj.created_by_id).username
@staticmethod
def resolve_snapshots(obj, context):
if context['request'].with_snapshots:
return obj.snapshot_set.all().distinct()
return Snapshot.objects.none()
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
return Crawl.objects.all().distinct()
@router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
"""Get a specific Crawl by id or abid."""
crawl = None
request.with_snapshots = with_snapshots
request.with_archiveresults = with_archiveresults
try:
crawl = Crawl.objects.get(abid__icontains=crawl_id)
except Exception:
pass
try:
crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
except Exception:
pass
return crawl
@router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)")
def get_any(request, abid: str):
"""Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
@@ -529,11 +428,13 @@ def get_any(request, abid: str):
pass
try:
from api.v1_crawls import get_seed
response = response or get_seed(request, abid)
except Exception:
pass
try:
from api.v1_crawls import get_crawl
response = response or get_crawl(request, abid)
except Exception:
pass
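
The pattern in get_any is a fallback chain: each model-specific getter is tried in turn, exceptions are swallowed, and the first non-None hit wins. Reduced to its core (illustrative only, names are not from the codebase):

# Sketch of the lookup chain used by get_any.
def first_match(request, abid, getters):
    result = None
    for getter in getters:
        try:
            result = result or getter(request, abid)
        except Exception:
            pass                           # unmatched ABIDs just fall through
    return result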

archivebox/api/v1_crawls.py (new file, 119 lines)

@@ -0,0 +1,119 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List
from datetime import datetime
from django.db.models import Q
from django.contrib.auth import get_user_model
from ninja import Router, Schema
from core.models import Snapshot
from crawls.models import Crawl
from seeds.models import Seed
from .auth import API_AUTH_METHODS
router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
class SeedSchema(Schema):
TYPE: str = 'seeds.models.Seed'
id: UUID
abid: str
modified_at: datetime
created_at: datetime
created_by_id: str
created_by_username: str
uri: str
tags_str: str
config: dict
@staticmethod
def resolve_created_by_id(obj):
return str(obj.created_by_id)
@staticmethod
def resolve_created_by_username(obj):
User = get_user_model()
return User.objects.get(id=obj.created_by_id).username
@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
def get_seeds(request):
return Seed.objects.all().distinct()
@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
def get_seed(request, seed_id: str):
seed = None
request.with_snapshots = False
request.with_archiveresults = False
try:
seed = Seed.objects.get(Q(abid__icontains=seed_id) | Q(id__icontains=seed_id))
except Exception:
pass
return seed
class CrawlSchema(Schema):
TYPE: str = 'core.models.Crawl'
id: UUID
abid: str
modified_at: datetime
created_at: datetime
created_by_id: str
created_by_username: str
status: str
retry_at: datetime | None
seed: SeedSchema
max_depth: int
# snapshots: List[SnapshotSchema]
@staticmethod
def resolve_created_by_id(obj):
return str(obj.created_by_id)
@staticmethod
def resolve_created_by_username(obj):
User = get_user_model()
return User.objects.get(id=obj.created_by_id).username
@staticmethod
def resolve_snapshots(obj, context):
if context['request'].with_snapshots:
return obj.snapshot_set.all().distinct()
return Snapshot.objects.none()
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
return Crawl.objects.all().distinct()
@router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
"""Get a specific Crawl by id or abid."""
crawl = None
request.with_snapshots = with_snapshots
request.with_archiveresults = with_archiveresults
try:
crawl = Crawl.objects.get(abid__icontains=crawl_id)
except Exception:
pass
try:
crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
except Exception:
pass
return crawl
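
Because the seed and crawl lookups use icontains on both id and abid, clients can pass a full ABID, a UUID, or any uniquely-identifying fragment of either. A hedged client-side sketch (the base URL, mount point, auth header name, and ABID fragment are all placeholders; the header depends on API_AUTH_METHODS):

# Sketch: calling the new /crawls/ endpoints from Python.
import requests

BASE = 'http://127.0.0.1:8000/api/v1'       # assumed mount point
HEADERS = {'X-API-Key': 'your-token'}       # header name depends on API_AUTH_METHODS

seeds = requests.get(f'{BASE}/crawls/seeds', headers=HEADERS).json()
crawl = requests.get(f'{BASE}/crawls/crawl/01JD2ABC', headers=HEADERS).json()  # partial ABID is enough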


@@ -40,6 +40,7 @@ def add(urls: str | list[str],
extractors: str="",
parser: str="auto",
persona: str='Default',
bg: bool=False,
created_by_id: int | None=None) -> QuerySet['Snapshot']:
"""Add a new URL or list of URLs to your archive"""
@@ -51,7 +52,6 @@ def add(urls: str | list[str],
setup_django()
check_data_folder()
from seeds.models import Seed
from crawls.models import Crawl
from actors.orchestrator import Orchestrator
@@ -83,8 +83,9 @@ def add(urls: str | list[str],
# from crawls.actors import CrawlActor
# from core.actors import SnapshotActor, ArchiveResultActor
orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=2)
orchestrator.start()
if not bg:
orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
orchestrator.start()
# 5. return the list of new Snapshots created
return crawl.snapshot_set.all()
@@ -169,6 +170,12 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=Non
help="Name of accounts persona to use when archiving.",
default="Default",
)
parser.add_argument(
"--bg",
default=False,
action="store_true",
help="Enqueue a background worker to complete the crawl instead of running it immediately",
)
command = parser.parse_args(args or ())
urls = command.urls
@@ -193,6 +200,7 @@ def main(args: list[str] | None=None, stdin: IO | None=Non
extractors=command.extract,
parser=command.parser,
persona=command.persona,
bg=command.bg,
)
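
The new --bg flag threads through to add(): with bg=True the call only records the Seed/Crawl and returns immediately, leaving the Orchestrator step to a separately running worker; without it, an Orchestrator is started inline (now with max_concurrent_actors=4). Equivalent Python calls (a sketch; the import path is assumed):

# Sketch: inline vs. background archiving via the Python API.
from archivebox.cli.archivebox_add import add   # hypothetical import path

add('https://example.com')              # runs an Orchestrator inline until idle
add('https://example.com', bg=True)     # just enqueues the crawl; `archivebox add --bg`
                                        # is the CLI equivalent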


@@ -23,30 +23,32 @@ urlpatterns = [
re_path(r"^static/(?P<path>.*)$", serve_static),
# re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda *_: 1/0), # type: ignore
path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
path('archive/', RedirectView.as_view(url='/')),
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', archivebox_admin.urls),
path("api/", include('api.urls'), name='api'),
path('public/', PublicIndexView.as_view(), name='public-index'),
path('archive/', RedirectView.as_view(url='/')),
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('add/', AddView.as_view(), name='add'),
path("jobs/", JobsDashboardView.as_view(), name='jobs_dashboard'),
path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', archivebox_admin.urls),
path("api/", include('api.urls'), name='api'),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda *_: 1/0), # type: ignore
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
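
The urls.py change is mostly a reordering for readability, but ordering is not purely cosmetic in Django: urlpatterns are tried top to bottom and the first match wins, so redirects and exact paths should sit where nothing above can shadow them. A reduced illustration using two entries from this file:

# Django resolves urlpatterns in order; the first pattern that matches wins.
from django.urls import path
from django.views.generic.base import RedirectView
from core.views import SnapshotView       # assumed import, per core/urls.py

urlpatterns = [
    path('archive/', RedirectView.as_view(url='/')),       # exact 'archive/' only
    path('archive/<path:path>', SnapshotView.as_view()),   # everything below it
]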


@@ -185,7 +185,7 @@ class BaseConfigSet(BaseSettings):
return computed_default
return value
def update_in_place(self, warn=True, persist=False, hint='', **kwargs):
def update_in_place(self, warn=False, persist=False, hint='', **kwargs):
"""
Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment
@@ -201,7 +201,7 @@ class BaseConfigSet(BaseSettings):
if all(key in _ALREADY_WARNED_ABOUT_UPDATED_CONFIG for key in kwargs.keys()):
warn = False
if warn:
if warn or os.environ.get('DEBUG', '').lower() in ('true', '1', 'yes', 'on'):
fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
print(f'\n[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)
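
The last change flips update_in_place() to be quiet by default while still forcing the warning whenever DEBUG is truthy in the environment (accepted values: true/1/yes/on). In practice (a sketch; the config instance and key are hypothetical):

# Sketch: forcing the config-fix warning via the DEBUG env var.
import os
os.environ['DEBUG'] = 'true'
STORAGE_CONFIG.update_in_place(TMP_DIR='/tmp/archivebox')  # hypothetical instance/key;
                                                           # warns despite warn=False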