From b948e4901391e3254b932c94362e5c81838cb4c0 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 19 Nov 2024 06:32:33 -0800 Subject: [PATCH] add urls log to Crawl model --- archivebox/cli/__init__.py | 3 +- archivebox/cli/archivebox_add.py | 8 +--- archivebox/core/models.py | 14 ++++++- archivebox/core/views.py | 66 ++++++++++++++++++++++++-------- archivebox/crawls/admin.py | 2 +- archivebox/crawls/models.py | 3 +- 6 files changed, 68 insertions(+), 28 deletions(-) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 18aa277c..24aeab3a 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -120,7 +120,8 @@ def cli(ctx, help=False): def main(args=None, prog_name=None): # show `docker run archivebox xyz` in help messages if running in docker IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes') - prog_name = prog_name or ('docker compose run archivebox' if IN_DOCKER else 'archivebox') + IS_TTY = sys.stdin.isatty() + prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox') try: cli(args=args, prog_name=prog_name) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index cd43865a..708b6a17 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -15,9 +15,7 @@ from django.db.models import QuerySet from archivebox.misc.util import enforce_types, docstring from archivebox import CONSTANTS from archivebox.config.common import ARCHIVING_CONFIG -from archivebox.config.django import setup_django from archivebox.config.permissions import USER, HOSTNAME -from archivebox.misc.checks import check_data_folder from archivebox.parsers import PARSERS @@ -46,12 +44,8 @@ def add(urls: str | list[str], depth = int(depth) assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' - - # 0. 
setup abx, django, check_data_folder - setup_django() - check_data_folder() - # then import models once django is set up + # import models once django is set up from crawls.models import Seed, Crawl from workers.orchestrator import Orchestrator from archivebox.base_models.models import get_or_create_system_user_pk diff --git a/archivebox/core/models.py b/archivebox/core/models.py index bdf01af4..fc311da5 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -204,8 +204,13 @@ class Snapshot(ModelWithOutputDir, ModelWithStateMachine, ABIDModel): if not self.timestamp: self.timestamp = str(self.bookmarked_at.timestamp()) - + super().save(*args, **kwargs) + + # make sure the crawl has this url in its urls log + if self.crawl and self.url not in self.crawl.urls: + self.crawl.urls += f'\n{self.url}' + self.crawl.save() def archive(self, overwrite=False, methods=None): result = bg_archive_snapshot(self, overwrite=overwrite, methods=methods) @@ -713,7 +718,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithStateMachine, ABIDModel): """Write the ArchiveResult json, html, and merkle indexes to output dir, and pass searchable text to the search backend""" super().write_indexes() self.save_search_index() + # self.save_outlinks_to_crawl() + # def save_outlinks_to_crawl(self): + # """Save the output of this ArchiveResult to the Crawl's urls field""" + # if self.output_urls: + # self.snapshot.crawl.urls += f'\n{self.url}' + # self.snapshot.crawl.save() + # def migrate_output_dir(self): # """Move the output files to the new folder structure if needed""" # print(f'{self}.migrate_output_dir()') diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 3603b43a..171d772c 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -1,6 +1,8 @@ __package__ = 'archivebox.core' import os +import sys +from django.utils import timezone import inspect from typing import Callable, get_type_hints from pathlib import Path @@ -21,19 +23,18 @@ from admin_data_views.typing import TableContext, ItemContext from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink import archivebox - -from core.models import Snapshot -from core.forms import AddLinkForm - -from workers.tasks import bg_add - -from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION +from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG from archivebox.misc.util import base_url, htmlencode, ts_to_date_str from archivebox.misc.serve_static import serve_static_with_byterange_support from archivebox.misc.logging_util import printable_filesize from archivebox.search import query_search_index +from core.models import Snapshot +from core.forms import AddLinkForm +from crawls.models import Seed, Crawl + + class HomepageView(View): def get(self, request): @@ -450,16 +451,14 @@ class AddView(UserPassesTestMixin, FormView): } def form_valid(self, form): - from core.admin_archiveresults import result_url - - url = form.cleaned_data["url"] - print(f'[+] Adding URL: {url}') + urls = form.cleaned_data["url"] + print(f'[+] Adding URL: {urls}') parser = form.cleaned_data["parser"] tag = form.cleaned_data["tag"] depth = 0 if form.cleaned_data["depth"] == "0" else 1 extractors = ','.join(form.cleaned_data["archive_methods"]) input_kwargs = { - "urls": url, + "urls": urls, "tag": tag, "depth": depth, "parser": parser, @@ -470,17 +469,50 @@ class AddView(UserPassesTestMixin, FormView): if extractors: 
input_kwargs.update({"extractors": extractors})
- result = bg_add(input_kwargs, parent_task_id=None)
- print('Started background add job:', result)
+
+ from archivebox.config.permissions import HOSTNAME
+
+
+ # 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt
+ sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
+ sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
+
+ # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt
+ seed = Seed.from_file(
+ sources_file,
+ label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}',
+ parser=parser,
+ tag=tag,
+ created_by=self.request.user.pk,
+ config={
+ # 'ONLY_NEW': not update,
+ # 'INDEX_ONLY': index_only,
+ # 'OVERWRITE': False,
+ 'DEPTH': depth,
+ 'EXTRACTORS': extractors,
+ # 'DEFAULT_PERSONA': persona or 'Default',
+ })
+ # 3. create a new Crawl pointing to the Seed
+ crawl = Crawl.from_seed(seed, max_depth=depth)
+
+ # 4. start the Orchestrator & wait until it completes
+ # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which get run by the ArchiveResultActors ...
+ # from crawls.actors import CrawlActor
+ # from core.actors import SnapshotActor, ArchiveResultActor
+

- rough_url_count = url.count('://')
+ rough_url_count = urls.count('://')
messages.success(
self.request,
- mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a few minutes to see results) {result_url(result)}"),
+ mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute to start seeing results) {crawl.admin_change_url}"),
)
+ # if not bg:
+ # from workers.orchestrator import Orchestrator
+ # orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
+ # orchestrator.start()

- return redirect("/admin/core/snapshot/")
+ return redirect(crawl.admin_change_url)


class HealthCheckView(View):

diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py
index 5e785f6a..5fc56c13 100644
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -70,7 +70,7 @@ class CrawlAdmin(ABIDModelAdmin):
search_fields = ('abid', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'seed__abid', 'schedule_id', 'schedule__abid', 'status', 'seed__uri')
readonly_fields = ('created_at', 'modified_at', 'abid_info', 'snapshots', 'seed_contents')
- fields = ('label', 'notes', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields)
+ fields = ('label', 'notes', 'urls', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields)
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']

diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index 802b0be0..5f302cf1 100644
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -225,6 +225,7 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
+ urls = models.TextField(blank=True, null=False, default='', help_text='The log of URLs discovered in this crawl')
label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this crawl')
notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
@@ -304,7 +305,7 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
return Snapshot.objects.get(crawl=self, url=self.seed.uri)
except Snapshot.DoesNotExist:
pass
-
+
root_snapshot, _ = Snapshot.objects.update_or_create(
crawl=self,
url=self.seed.uri,
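
Reviewer note (illustration only, not part of the patch): the core behaviour added above is that Snapshot.save() now appends the snapshot's URL to its Crawl's new urls text field whenever it is missing, turning Crawl.urls into a newline-separated log of every URL discovered during the crawl. The standalone sketch below mirrors that append-if-missing logic with plain strings so the intended semantics are easy to check; the helper name append_url_to_crawl_log and the example URLs are hypothetical and do not exist in the codebase.

# Standalone sketch (plain Python, no Django) of the append-if-missing logic that
# Snapshot.save() gains in this patch. The helper is hypothetical; the real code
# operates on self.crawl.urls and self.url inside Snapshot.save().

def append_url_to_crawl_log(crawl_urls: str, url: str) -> str:
    """Return the urls log with `url` appended on a new line, unless it already appears."""
    if url and url not in crawl_urls:
        crawl_urls += f'\n{url}'
    return crawl_urls

log = ''  # stand-in for Crawl.urls, which defaults to ''
for url in ('https://example.com', 'https://example.com/about', 'https://example.com'):
    log = append_url_to_crawl_log(log, url)

print(log.strip().splitlines())
# -> ['https://example.com', 'https://example.com/about']  (duplicate skipped)

One design point worth flagging in review: the membership test is a plain substring check against the text field, so a URL that happens to be a substring of an already-logged URL (e.g. a shorter prefix of it) would also be skipped; splitting the log on newlines and comparing exact entries would avoid that edge case.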