From b948e4901391e3254b932c94362e5c81838cb4c0 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 19 Nov 2024 06:32:33 -0800 Subject: [PATCH] add urls log to Crawl model --- archivebox/cli/__init__.py | 3 +- archivebox/cli/archivebox_add.py | 8 +--- archivebox/core/models.py | 14 ++++++- archivebox/core/views.py | 66 ++++++++++++++++++++++++-------- archivebox/crawls/admin.py | 2 +- archivebox/crawls/models.py | 3 +- 6 files changed, 68 insertions(+), 28 deletions(-) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 18aa277c..24aeab3a 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -120,7 +120,8 @@ def cli(ctx, help=False): def main(args=None, prog_name=None): # show `docker run archivebox xyz` in help messages if running in docker IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes') - prog_name = prog_name or ('docker compose run archivebox' if IN_DOCKER else 'archivebox') + IS_TTY = sys.stdin.isatty() + prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox') try: cli(args=args, prog_name=prog_name) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index cd43865a..708b6a17 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -15,9 +15,7 @@ from django.db.models import QuerySet from archivebox.misc.util import enforce_types, docstring from archivebox import CONSTANTS from archivebox.config.common import ARCHIVING_CONFIG -from archivebox.config.django import setup_django from archivebox.config.permissions import USER, HOSTNAME -from archivebox.misc.checks import check_data_folder from archivebox.parsers import PARSERS @@ -46,12 +44,8 @@ def add(urls: str | list[str], depth = int(depth) assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' - - # 0. 
setup abx, django, check_data_folder - setup_django() - check_data_folder() - # then import models once django is set up + # import models once django is set up from crawls.models import Seed, Crawl from workers.orchestrator import Orchestrator from archivebox.base_models.models import get_or_create_system_user_pk diff --git a/archivebox/core/models.py b/archivebox/core/models.py index bdf01af4..fc311da5 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -204,8 +204,13 @@ class Snapshot(ModelWithOutputDir, ModelWithStateMachine, ABIDModel): if not self.timestamp: self.timestamp = str(self.bookmarked_at.timestamp()) - + super().save(*args, **kwargs) + + # make sure the crawl has this url in its urls log + if self.crawl and self.url not in self.crawl.urls: + self.crawl.urls += f'\n{self.url}' + self.crawl.save() def archive(self, overwrite=False, methods=None): result = bg_archive_snapshot(self, overwrite=overwrite, methods=methods) @@ -713,7 +718,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithStateMachine, ABIDModel): """Write the ArchiveResult json, html, and merkle indexes to output dir, and pass searchable text to the search backend""" super().write_indexes() self.save_search_index() + # self.save_outlinks_to_crawl() + # def save_outlinks_to_crawl(self): + # """Save the output of this ArchiveResult to the Crawl's urls field""" + # if self.output_urls: + # self.snapshot.crawl.urls += f'\n{self.url}' + # self.snapshot.crawl.save() + # def migrate_output_dir(self): # """Move the output files to the new folder structure if needed""" # print(f'{self}.migrate_output_dir()') diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 3603b43a..171d772c 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -1,6 +1,8 @@ __package__ = 'archivebox.core' import os +import sys +from django.utils import timezone import inspect from typing import Callable, get_type_hints from pathlib import Path @@ -21,19 +23,18 @@ from admin_data_views.typing import TableContext, ItemContext from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink import archivebox - -from core.models import Snapshot -from core.forms import AddLinkForm - -from workers.tasks import bg_add - -from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION +from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG from archivebox.misc.util import base_url, htmlencode, ts_to_date_str from archivebox.misc.serve_static import serve_static_with_byterange_support from archivebox.misc.logging_util import printable_filesize from archivebox.search import query_search_index +from core.models import Snapshot +from core.forms import AddLinkForm +from crawls.models import Seed, Crawl + + class HomepageView(View): def get(self, request): @@ -450,16 +451,14 @@ class AddView(UserPassesTestMixin, FormView): } def form_valid(self, form): - from core.admin_archiveresults import result_url - - url = form.cleaned_data["url"] - print(f'[+] Adding URL: {url}') + urls = form.cleaned_data["url"] + print(f'[+] Adding URL: {urls}') parser = form.cleaned_data["parser"] tag = form.cleaned_data["tag"] depth = 0 if form.cleaned_data["depth"] == "0" else 1 extractors = ','.join(form.cleaned_data["archive_methods"]) input_kwargs = { - "urls": url, + "urls": urls, "tag": tag, "depth": depth, "parser": parser, @@ -470,17 +469,50 @@ class AddView(UserPassesTestMixin, FormView): if extractors: 
input_kwargs.update({"extractors": extractors})
- result = bg_add(input_kwargs, parent_task_id=None)
- print('Started background add job:', result)
+
+ from archivebox.config.permissions import HOSTNAME
+
+
+ # 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt
+ sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
+ sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
+
+ # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt
+ seed = Seed.from_file(
+ sources_file,
+ label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}',
+ parser=parser,
+ tag=tag,
+ created_by=self.request.user.pk,
+ config={
+ # 'ONLY_NEW': not update,
+ # 'INDEX_ONLY': index_only,
+ # 'OVERWRITE': False,
+ 'DEPTH': depth,
+ 'EXTRACTORS': extractors,
+ # 'DEFAULT_PERSONA': persona or 'Default',
+ })
+ # 3. create a new Crawl pointing to the Seed
+ crawl = Crawl.from_seed(seed, max_depth=depth)
+
+ # 4. start the Orchestrator & wait until it completes
+ # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which get run by the ArchiveResultActors ...
+ # from crawls.actors import CrawlActor
+ # from core.actors import SnapshotActor, ArchiveResultActor
+

- rough_url_count = url.count('://')
+ rough_url_count = urls.count('://')
messages.success(
self.request,
- mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a few minutes to see results) {result_url(result)}"),
+ mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute to start seeing results) {crawl.admin_change_url}"),
)
+ # if not bg:
+ # from workers.orchestrator import Orchestrator
+ # orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
+ # orchestrator.start()

- return redirect("/admin/core/snapshot/")
+ return redirect(crawl.admin_change_url)


class HealthCheckView(View):

diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py
index 5e785f6a..5fc56c13 100644
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -70,7 +70,7 @@ class CrawlAdmin(ABIDModelAdmin):
search_fields = ('abid', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'seed__abid', 'schedule_id', 'schedule__abid', 'status', 'seed__uri')
readonly_fields = ('created_at', 'modified_at', 'abid_info', 'snapshots', 'seed_contents')
- fields = ('label', 'notes', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields)
+ fields = ('label', 'notes', 'urls', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields)
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']

diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index 802b0be0..5f302cf1 100644
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -225,6 +225,7 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
+ urls = models.TextField(blank=True, null=False, default='', help_text='The log of URLs discovered in this crawl')
label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this crawl')
notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
@@ -304,7 +305,7 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
return Snapshot.objects.get(crawl=self, url=self.seed.uri)
except Snapshot.DoesNotExist:
pass
-
+
root_snapshot, _ = Snapshot.objects.update_or_create(
crawl=self,
url=self.seed.uri,
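
Reviewer note (illustration only, not part of the patch): the core behaviour added above is that Snapshot.save() now appends the snapshot's URL to its Crawl's new urls text field whenever it is missing, turning Crawl.urls into a newline-separated log of every URL discovered during the crawl. The standalone sketch below mirrors that append-if-missing logic with plain strings so the intended semantics are easy to check; the helper name append_url_to_crawl_log and the example URLs are hypothetical and do not exist in the codebase.

# Standalone sketch (plain Python, no Django) of the append-if-missing logic that
# Snapshot.save() gains in this patch. The helper is hypothetical; the real code
# operates on self.crawl.urls and self.url inside Snapshot.save().

def append_url_to_crawl_log(crawl_urls: str, url: str) -> str:
    """Return the urls log with `url` appended on a new line, unless it already appears."""
    if url and url not in crawl_urls:
        crawl_urls += f'\n{url}'
    return crawl_urls

log = ''  # stand-in for Crawl.urls, which defaults to ''
for url in ('https://example.com', 'https://example.com/about', 'https://example.com'):
    log = append_url_to_crawl_log(log, url)

print(log.strip().splitlines())
# -> ['https://example.com', 'https://example.com/about']  (duplicate skipped)

One design point worth flagging in review: the membership test is a plain substring check against the text field, so a URL that happens to be a substring of an already-logged URL (e.g. a shorter prefix of it) would also be skipped; splitting the log on newlines and comparing exact entries would avoid that edge case.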