add urls log to Crawl model

Nick Sweeting 2024-11-19 06:32:33 -08:00
parent 28386ff172
commit b948e49013
6 changed files with 68 additions and 28 deletions

View file

@@ -120,7 +120,8 @@ def cli(ctx, help=False):
 def main(args=None, prog_name=None):
     # show `docker run archivebox xyz` in help messages if running in docker
     IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
-    prog_name = prog_name or ('docker compose run archivebox' if IN_DOCKER else 'archivebox')
+    IS_TTY = sys.stdin.isatty()
+    prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox')
     try:
         cli(args=args, prog_name=prog_name)
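
Note: `docker compose run` allocates a pseudo-TTY by default, which breaks piped, non-interactive invocations (e.g. `echo 'https://example.com' | docker compose run archivebox add`); the `-T` flag disables TTY allocation. A minimal standalone sketch of the prog_name logic above (illustration only, not part of the commit):

import os
import sys

# suggest `docker compose run -T ...` in help text when stdin is not an interactive terminal
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
IS_TTY = sys.stdin.isatty()
prog_name = f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox'
print(prog_name)  # e.g. 'docker compose run -T archivebox' when stdin is piped inside docker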

View file

@@ -15,9 +15,7 @@ from django.db.models import QuerySet
 from archivebox.misc.util import enforce_types, docstring
 from archivebox import CONSTANTS
 from archivebox.config.common import ARCHIVING_CONFIG
-from archivebox.config.django import setup_django
 from archivebox.config.permissions import USER, HOSTNAME
-from archivebox.misc.checks import check_data_folder
 from archivebox.parsers import PARSERS
@@ -46,12 +44,8 @@ def add(urls: str | list[str],
     depth = int(depth)
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
-    # 0. setup abx, django, check_data_folder
-    setup_django()
-    check_data_folder()
-    # then import models once django is set up
+    # import models once django is set up
     from crawls.models import Seed, Crawl
     from workers.orchestrator import Orchestrator
     from archivebox.base_models.models import get_or_create_system_user_pk
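
Note: with setup_django() and check_data_folder() removed from add() itself, the hunk above assumes the CLI entrypoint (or any other caller) has already initialized Django before add() is invoked. A hedged caller-side sketch; the import path of add() is an assumption, while the setup helpers are the ones removed above:

from archivebox.config.django import setup_django
from archivebox.misc.checks import check_data_folder

setup_django()        # configure Django settings and the database connection
check_data_folder()   # verify we are running inside a valid ArchiveBox data dir

from archivebox.cli.archivebox_add import add   # module path assumed; matches the add() signature above
add(urls='https://example.com', depth=0)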

View file

@@ -204,8 +204,13 @@ class Snapshot(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
         if not self.timestamp:
             self.timestamp = str(self.bookmarked_at.timestamp())
         super().save(*args, **kwargs)
+
+        # make sure the crawl has this url in its urls log
+        if self.crawl and self.url not in self.crawl.urls:
+            self.crawl.urls += f'\n{self.url}'
+            self.crawl.save()
 
     def archive(self, overwrite=False, methods=None):
         result = bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
@@ -713,7 +718,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
         """Write the ArchiveResult json, html, and merkle indexes to output dir, and pass searchable text to the search backend"""
         super().write_indexes()
         self.save_search_index()
+        # self.save_outlinks_to_crawl()
+
+    # def save_outlinks_to_crawl(self):
+    #     """Save the output of this ArchiveResult to the Crawl's urls field"""
+    #     if self.output_urls:
+    #         self.snapshot.crawl.urls += f'\n{self.url}'
+    #         self.snapshot.crawl.save()
 
     # def migrate_output_dir(self):
     #     """Move the output files to the new folder structure if needed"""
     #     print(f'{self}.migrate_output_dir()')
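
Note: the Snapshot.save() hunk above maintains Crawl.urls as a newline-delimited log, appending each snapshot URL only if it is not already present, and the commented-out save_outlinks_to_crawl() sketches the same append for outlinks discovered by extractors. A standalone illustration of that log format using plain strings (no Django models involved):

def append_to_url_log(urls_log: str, url: str) -> str:
    """Append url to a newline-delimited log unless it is already recorded.

    Mirrors the membership check in Snapshot.save() above; note the substring
    test also treats a URL as present when it only appears inside a longer
    logged URL.
    """
    if url not in urls_log:
        urls_log += f'\n{url}'
    return urls_log

log = 'https://example.com'
log = append_to_url_log(log, 'https://example.com/page2')
log = append_to_url_log(log, 'https://example.com')  # already logged, no-op
print(log.strip().split('\n'))  # ['https://example.com', 'https://example.com/page2']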

View file

@@ -1,6 +1,8 @@
 __package__ = 'archivebox.core'
 
 import os
+import sys
+from django.utils import timezone
 import inspect
 from typing import Callable, get_type_hints
 from pathlib import Path
@@ -21,19 +23,18 @@ from admin_data_views.typing import TableContext, ItemContext
 from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
 
 import archivebox
-from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
-from core.models import Snapshot
-from core.forms import AddLinkForm
-from workers.tasks import bg_add
+from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION
 from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
 from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
 from archivebox.misc.serve_static import serve_static_with_byterange_support
 from archivebox.misc.logging_util import printable_filesize
 from archivebox.search import query_search_index
+from core.models import Snapshot
+from core.forms import AddLinkForm
+from crawls.models import Seed, Crawl
 
 
 class HomepageView(View):
     def get(self, request):
@@ -450,16 +451,14 @@ class AddView(UserPassesTestMixin, FormView):
         }
 
     def form_valid(self, form):
-        from core.admin_archiveresults import result_url
-
-        url = form.cleaned_data["url"]
-        print(f'[+] Adding URL: {url}')
+        urls = form.cleaned_data["url"]
+        print(f'[+] Adding URL: {urls}')
         parser = form.cleaned_data["parser"]
         tag = form.cleaned_data["tag"]
         depth = 0 if form.cleaned_data["depth"] == "0" else 1
         extractors = ','.join(form.cleaned_data["archive_methods"])
         input_kwargs = {
-            "urls": url,
+            "urls": urls,
             "tag": tag,
            "depth": depth,
             "parser": parser,
@@ -470,17 +469,50 @@ class AddView(UserPassesTestMixin, FormView):
         if extractors:
             input_kwargs.update({"extractors": extractors})
 
-        result = bg_add(input_kwargs, parent_task_id=None)
-        print('Started background add job:', result)
+        from archivebox.config.permissions import HOSTNAME
+
+        # 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
+        sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
+        sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
+
+        # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
+        seed = Seed.from_file(
+            sources_file,
+            label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}',
+            parser=parser,
+            tag=tag,
+            created_by=self.request.user.pk,
+            config={
+                # 'ONLY_NEW': not update,
+                # 'INDEX_ONLY': index_only,
+                # 'OVERWRITE': False,
+                'DEPTH': depth,
+                'EXTRACTORS': parser,
+                # 'DEFAULT_PERSONA': persona or 'Default',
+            })
+
+        # 3. create a new Crawl pointing to the Seed
+        crawl = Crawl.from_seed(seed, max_depth=depth)
+
+        # 4. start the Orchestrator & wait until it completes
+        # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which get run by the ArchiveResultActors ...
+        # from crawls.actors import CrawlActor
+        # from core.actors import SnapshotActor, ArchiveResultActor
 
-        rough_url_count = url.count('://')
+        rough_url_count = urls.count('://')
 
         messages.success(
             self.request,
-            mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a few minutes to see results) {result_url(result)}"),
+            mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute to start seeing results) {crawl.admin_change_url}"),
         )
 
-        return redirect("/admin/core/snapshot/")
+        # if not bg:
+        #     from workers.orchestrator import Orchestrator
+        #     orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
+        #     orchestrator.start()
+
+        return redirect(crawl.admin_change_url)
 
 
 class HealthCheckView(View):
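
Note: the rewritten form_valid() above no longer enqueues a bg_add task; it writes the submitted URLs to a sources/ file, wraps them in a Seed and a Crawl, and leaves the rest to the background workers (the commented block shows what an inline Orchestrator run would look like instead). A hedged sketch of exercising the same flow from `archivebox shell`; Seed.from_file(), Crawl.from_seed(), and the Orchestrator arguments come from the hunks in this commit, while the 'auto' parser value, the omitted tag, and running the Orchestrator in the foreground are assumptions:

# run inside an initialized ArchiveBox data dir (e.g. via `archivebox shell`)
from django.utils import timezone

from archivebox import CONSTANTS
from archivebox.base_models.models import get_or_create_system_user_pk
from crawls.models import Seed, Crawl
from workers.orchestrator import Orchestrator

# 1. write the URLs to a sources/ file, as form_valid() does for web submissions
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__shell_add.txt'
sources_file.write_text('https://example.com')

# 2. + 3. create a Seed pointing at the file, then a Crawl pointing at the Seed
seed = Seed.from_file(
    sources_file,
    label='shell-add',
    parser='auto',                                # assumed default parser name
    created_by=get_or_create_system_user_pk(),
    config={'DEPTH': 0},
)
crawl = Crawl.from_seed(seed, max_depth=0)

# 4. drain the queue in the foreground instead of leaving it to background workers
orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
orchestrator.start()

crawl.refresh_from_db()
print(crawl.urls)  # newline-delimited URL log populated by Snapshot.save() as snapshots are created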

View file

@@ -70,7 +70,7 @@ class CrawlAdmin(ABIDModelAdmin):
     search_fields = ('abid', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'seed__abid', 'schedule_id', 'schedule__abid', 'status', 'seed__uri')
     readonly_fields = ('created_at', 'modified_at', 'abid_info', 'snapshots', 'seed_contents')
-    fields = ('label', 'notes', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields)
+    fields = ('label', 'notes', 'urls', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields)
     list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
     ordering = ['-created_at', '-retry_at']

View file

@@ -225,6 +225,7 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
 
     seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
+    urls = models.TextField(blank=True, null=False, default='', help_text='The log of URLs discovered in this crawl')
     label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this crawl')
     notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
@@ -304,7 +305,7 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
             return Snapshot.objects.get(crawl=self, url=self.seed.uri)
         except Snapshot.DoesNotExist:
             pass
 
         root_snapshot, _ = Snapshot.objects.update_or_create(
             crawl=self,
             url=self.seed.uri,
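
Note: the new urls TextField on Crawl is a schema change, and none of the six diffs shown in this commit is a migration file, so a Django migration has to be generated separately (e.g. via makemigrations). A hedged sketch of roughly what that auto-generated migration would contain; the file name and dependency are assumptions:

# crawls/migrations/00XX_crawl_urls.py  (name and dependency are assumptions, not from this commit)
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('crawls', '00XX_previous'),  # replace with the latest existing crawls migration
    ]

    operations = [
        migrations.AddField(
            model_name='crawl',
            name='urls',
            field=models.TextField(blank=True, default='',
                                   help_text='The log of URLs discovered in this crawl'),
        ),
    ]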